diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -2017,6 +2017,32 @@ TARGET_BUILTIN(__builtin_ia32_vfmsubsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vfmaddcph128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vfmaddcph128_maskz, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_maskz, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_maskz, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_maskz, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_maskz, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_maskz, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_vfmulcsh_mask, "V4fV4fV4fV4fUcIi", 
"ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vfcmulcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vfmulcph128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vfmulcph256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vfmulcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vfcmulcph128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vfcmulcph256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vfcmulcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16") + // generic select intrinsics TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "ncV:128:", "avx512bw,avx512vl") TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "ncV:256:", "avx512bw,avx512vl") diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -508,6 +508,23 @@ return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A); } +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) { + return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f)); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) { + return (__m512h)__builtin_ia32_selectps_512( + (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) { + return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_conj_pch(__A), + (__v16sf)_mm512_setzero_ps()); +} + static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A, __m128h __B) { __A[0] += __B[0]; @@ -2852,6 +2869,347 @@ (__m128h) 
__builtin_ia32_vfmsubsh3_mask3( \ (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ (__mmask8)(U), (int)(R)) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__C, (__v4sf)__A, + (__v4sf)__B, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectps_128( + (__mmask8)(__U & 1), /* scalar op: only mask bit 0 may select the computed value; lanes 1-3 always keep __A, as in _mm_mask_fcmadd_round_sch */ + __builtin_ia32_vfcmaddcsh_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B, + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION), + (__v4sf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__C, (__v4sf)__A, + (__v4sf)__B, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fcmadd_round_sch(A, B, C, R) \ + (__m128h) __builtin_ia32_vfcmaddcsh_mask( \ + (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__mmask8)-1, (int)(R)) + +#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \ + (__m128h) __builtin_ia32_selectps_128( \ + (__mmask8)((U) & 1), /* U parenthesised so expressions like a | b are masked as a whole */ \ + __builtin_ia32_vfcmaddcsh_mask( \ + (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__mmask8)(U), (int)(R)), \ + (__v4sf)(__m128h)(A)) + +#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \ + (__m128h) __builtin_ia32_vfcmaddcsh_maskz( \ + (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__mmask8)(U), (int)(R)) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__C, (__v4sf)__A, + (__v4sf)__B, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + 
return (__m128h)__builtin_ia32_selectps_128( + (__mmask8)(__U & 1), /* scalar op: only mask bit 0 may select the computed value; lanes 1-3 always keep __A, as in _mm_mask_fmadd_round_sch */ + __builtin_ia32_vfmaddcsh_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B, + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION), + (__v4sf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__C, (__v4sf)__A, + (__v4sf)__B, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fmadd_round_sch(A, B, C, R) \ + (__m128h) __builtin_ia32_vfmaddcsh_mask( \ + (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__mmask8)-1, (int)(R)) + +#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \ + (__m128h) __builtin_ia32_selectps_128( \ + (__mmask8)((U) & 1), /* U parenthesised so expressions like a | b are masked as a whole */ \ + __builtin_ia32_vfmaddcsh_mask( \ + (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__mmask8)(U), (int)(R)), \ + (__v4sf)(__m128h)(A)) + +#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \ + (__m128h) __builtin_ia32_vfmaddcsh_maskz( \ + (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__mmask8)(U), (int)(R)) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfcmulcsh_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_vfcmulcsh_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fcmul_round_sch(A, B, R) \ + (__m128h) __builtin_ia32_vfcmulcsh_mask( \ + (__v4sf)(__m128h)(A), 
(__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)) + +#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \ + (__m128h) __builtin_ia32_vfcmulcsh_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \ + (__mmask8)(U), (int)(R)) + +#define _mm_maskz_fcmul_round_sch(U, A, B, R) \ + (__m128h) __builtin_ia32_vfcmulcsh_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfmulcsh_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_vfmulcsh_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fmul_round_sch(A, B, R) \ + (__m128h) __builtin_ia32_vfmulcsh_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)) + +#define _mm_mask_fmul_round_sch(W, U, A, B, R) \ + (__m128h) __builtin_ia32_vfmulcsh_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \ + (__mmask8)(U), (int)(R)) + +#define _mm_maskz_fmul_round_sch(U, A, B, R) \ + (__m128h) __builtin_ia32_vfmulcsh_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A, + __m512h __B) { + return 
(__m512h)__builtin_ia32_vfcmulcph512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_vfcmulcph512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fcmul_round_pch(A, B, R) \ + (__m512h) __builtin_ia32_vfcmulcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)) + +#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \ + (__m512h) __builtin_ia32_vfcmulcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \ + (__mmask16)(U), (int)(R)) + +#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \ + (__m512h) __builtin_ia32_vfcmulcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_vfmulcph512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) { + return 
(__m512h)__builtin_ia32_vfmulcph512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fmul_round_pch(A, B, R) \ + (__m512h) __builtin_ia32_vfmulcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)) + +#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \ + (__m512h) __builtin_ia32_vfmulcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \ + (__mmask16)(U), (int)(R)) + +#define _mm512_maskz_fmul_round_pch(U, A, B, R) \ + (__m512h) __builtin_ia32_vfmulcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A, + __m512h __B, + __m512h __C) { + return (__m512h)__builtin_ia32_vfcmaddcph512_mask((__v16sf)__C, (__v16sf)__A, + (__v16sf)__B, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_selectps_512( + __U, + __builtin_ia32_vfcmaddcph512_mask((__v16sf)__C, (__v16sf)__A, + (__v16sf)__B, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION), + (__v16sf)__A); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { + return (__m512h)__builtin_ia32_vfcmaddcph512_mask( + (__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfcmaddcph512_maskz( + (__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fcmadd_round_pch(A, B, C, R) \ + (__m512h) 
__builtin_ia32_vfcmaddcph512_mask( \ + (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__mmask16)-1, (int)(R)) + +#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \ + (__m512h) __builtin_ia32_selectps_512( \ + (__mmask16)(U), /* lanes whose bit in U is clear keep A */ \ + __builtin_ia32_vfcmaddcph512_mask( \ + (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__mmask16)(U), (int)(R)), \ + (__v16sf)(__m512h)(A)) + +#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \ + (__m512h) __builtin_ia32_vfcmaddcph512_mask( \ + (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__mmask16)(U), (int)(R)) + +#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \ + (__m512h) __builtin_ia32_vfcmaddcph512_maskz( \ + (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__mmask16)(U), (int)(R)) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A, + __m512h __B, + __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A, + (__v16sf)__B, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_selectps_512( + __U, + __builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A, (__v16sf)__B, + (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION), + (__v16sf)__A); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { + return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A, + (__v16sf)__B, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddcph512_maskz( + (__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define 
_mm512_fmadd_round_pch(A, B, C, R) \ + (__m512h) __builtin_ia32_vfmaddcph512_mask( \ + (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__mmask16)-1, (int)(R)) + +#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \ + (__m512h) __builtin_ia32_selectps_512( \ + (__mmask16)(U), /* lanes whose bit in U is clear keep A */ \ + __builtin_ia32_vfmaddcph512_mask( \ + (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__mmask16)(U), (int)(R)), \ + (__v16sf)(__m512h)(A)) + +#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \ + (__m512h) __builtin_ia32_vfmaddcph512_mask( \ + (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__mmask16)(U), (int)(R)) + +#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \ + (__m512h) __builtin_ia32_vfmaddcph512_maskz( \ + (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__mmask16)(U), (int)(R)) + #define _mm512_mask_reduce_operator(op) \ __m256h __t1 = (__m256h)_mm512_extractf64x4_pd((__m512d)__W, 0); \ __m256h __t2 = (__m256h)_mm512_extractf64x4_pd((__m512d)__W, 1); \ diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -312,6 +312,39 @@ return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); } +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) { + return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f)); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) { + return (__m256h)__builtin_ia32_selectps_256( + (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) { + return (__m256h)__builtin_ia32_selectps_256( + (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128h 
__DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) { + return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f)); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W, + __mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_selectps_128( + (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) { + return (__m128h)__builtin_ia32_selectps_128( + (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps()); +} + #define _mm256_cmp_ph_mask(a, b, p) \ (__mmask16) __builtin_ia32_cmpph256_mask( \ (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1) @@ -1744,6 +1777,192 @@ (__v16hf)__C); } +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfcmulcph128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_vfcmulcph128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmul_pch(__m256h __A, + __m256h __B) { /* 256-bit variant: uses the 256-bit attribute set like the other _mm256_ intrinsics in this header */ + return (__m256h)__builtin_ia32_vfcmulcph256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 
+_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_vfcmulcph256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__C, (__v4sf)__A, + (__v4sf)__B, (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectps_128( + __U, + __builtin_ia32_vfcmaddcph128_mask((__v4sf)__C, (__v4sf)(__m128h)__A, + (__v4sf)__B, (__mmask8)__U), + (__v4sf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__C, (__v4sf)__A, + (__v4sf)__B, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_vfcmaddcph128_maskz( + (__v4sf)__C, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A, + __m256h __B, + __m256h __C) { + return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__C, (__v8sf)__A, + (__v8sf)__B, (__mmask8)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectps_256( + __U, + __builtin_ia32_vfcmaddcph256_mask((__v8sf)__C, (__v8sf)__A, (__v8sf)__B, + (__mmask8)__U), + (__v8sf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { + return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__C, (__v8sf)__A, + (__v8sf)__B, (__mmask8)__U); +} + +static __inline__ __m256h 
__DEFAULT_FN_ATTRS256 +_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_vfcmaddcph256_maskz( + (__v8sf)__C, (__v8sf)__A, (__v8sf)__B, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfmulcph128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_vfmulcph128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_vfmulcph256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_vfmulcph256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__C, (__v4sf)__A, + (__v4sf)__B, (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return 
(__m128h)__builtin_ia32_selectps_128( + __U, + __builtin_ia32_vfmaddcph128_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B, + (__mmask8)__U), + (__v4sf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__C, (__v4sf)__A, + (__v4sf)__B, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__C, (__v4sf)__A, + (__v4sf)__B, (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A, + __m256h __B, + __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__C, (__v8sf)__A, + (__v8sf)__B, (__mmask8)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectps_256( + __U, + __builtin_ia32_vfmaddcph256_mask((__v8sf)__C, (__v8sf)__A, (__v8sf)__B, + (__mmask8)__U), + (__v8sf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { + return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__C, (__v8sf)__A, + (__v8sf)__B, (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__C, (__v8sf)__A, + (__v8sf)__B, (__mmask8)__U); +} + static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3978,6 +3978,16 @@ case X86::BI__builtin_ia32_vfmaddsubph512_maskz: case 
X86::BI__builtin_ia32_vfmaddsubph512_mask3: case X86::BI__builtin_ia32_vfmsubaddph512_mask3: + case X86::BI__builtin_ia32_vfmaddcsh_mask: + case X86::BI__builtin_ia32_vfmaddcph512_mask: + case X86::BI__builtin_ia32_vfmaddcph512_maskz: + case X86::BI__builtin_ia32_vfcmaddcsh_mask: + case X86::BI__builtin_ia32_vfcmaddcph512_mask: + case X86::BI__builtin_ia32_vfcmaddcph512_maskz: + case X86::BI__builtin_ia32_vfmulcsh_mask: + case X86::BI__builtin_ia32_vfmulcph512_mask: + case X86::BI__builtin_ia32_vfcmulcsh_mask: + case X86::BI__builtin_ia32_vfcmulcph512_mask: ArgNum = 4; HasRC = true; break; diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c --- a/clang/test/CodeGen/X86/avx512fp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c @@ -656,6 +656,48 @@ return _mm512_abs_ph(a); } +__m512h test_mm512_conj_pch(__m512h __A) { + // CHECK-LABEL: @test_mm512_conj_pch + // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float> + // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32> + // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32> + // CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float> + // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half> + return _mm512_conj_pch(__A); +} + +__m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_mask_conj_pch + // CHECK: %{{.*}} = trunc i32 %{{.*}} to i16 + // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float> + // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32> + // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32> + // CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float> + // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half> + // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float> + // CHECK: %{{.*}} = bitcast 
i16 %{{.*}} to <16 x i1> + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half> + return _mm512_mask_conj_pch(__W, __U, __A); +} + +__m512h test_mm512_maskz_conj_pch(__mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_maskz_conj_pch + // CHECK: %{{.*}} = trunc i32 %{{.*}} to i16 + // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float> + // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32> + // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32> + // CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float> + // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half> + // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half> + return _mm512_maskz_conj_pch(__U, __A); +} + __m128h test_mm_add_round_sh(__m128h __A, __m128h __B) { // CHECK-LABEL: @test_mm_add_round_sh // CHECK: @llvm.x86.avx512fp16.mask.add.sh.round @@ -3994,6 +4036,346 @@ return _mm_mask3_fnmsub_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } +__m128h test_mm_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fcmadd_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.sh + return _mm_fcmadd_sch(__A, __B, __C); +} + +__m128h test_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_fcmadd_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.sh + // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} + return _mm_mask_fcmadd_sch(__A, __U, __B, __C); +} + +__m128h test_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // 
CHECK-LABEL: @test_mm_maskz_fcmadd_sch + // CHECK: @llvm.x86.avx512fp16.maskz.vfcmaddc.sh + return _mm_maskz_fcmadd_sch(__U, __A, __B, __C); +} + +__m128h test_mm_fcmadd_round_sch(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fcmadd_round_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.sh + return _mm_fcmadd_round_sch(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask_fcmadd_round_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_fcmadd_round_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.sh + // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} + return _mm_mask_fcmadd_round_sch(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_fcmadd_round_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fcmadd_round_sch + // CHECK: @llvm.x86.avx512fp16.maskz.vfcmaddc.sh + return _mm_maskz_fcmadd_round_sch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_fmadd_sch(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fmadd_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.sh + return _mm_fmadd_sch(__A, __B, __C); +} + +__m128h test_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_fmadd_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.sh + // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} + return _mm_mask_fmadd_sch(__A, __U, __B, __C); +} + +__m128h test_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fmadd_sch + // CHECK: @llvm.x86.avx512fp16.maskz.vfmaddc.sh + return _mm_maskz_fmadd_sch(__U, __A, __B, __C); 
+} + +__m128h test_mm_fmadd_round_sch(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fmadd_round_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.sh + return _mm_fmadd_round_sch(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask_fmadd_round_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_fmadd_round_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.sh + // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} + return _mm_mask_fmadd_round_sch(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_fmadd_round_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fmadd_round_sch + // CHECK: @llvm.x86.avx512fp16.maskz.vfmaddc.sh + return _mm_maskz_fmadd_round_sch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +// CFC MUL SH + +__m128h test_mm_fcmul_sch(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_fcmul_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.sh + return _mm_fcmul_sch(__A, __B); +} + +__m128h test_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fcmul_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.sh + return _mm_mask_fcmul_sch(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_fcmul_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.sh + return _mm_maskz_fcmul_sch(__U, __A, __B); +} + +__m128h test_mm_fcmul_round_sch(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_fcmul_round_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.sh + return _mm_fcmul_round_sch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask_fcmul_round_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // 
CHECK-LABEL: @test_mm_mask_fcmul_round_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.sh + return _mm_mask_fcmul_round_sch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_fcmul_round_sch(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_fcmul_round_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.sh + return _mm_maskz_fcmul_round_sch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +// CFC MUL PH + +__m512h test_mm512_fcmul_pch(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_fcmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.ph.512 + return _mm512_fcmul_pch(__A, __B); +} + +__m512h test_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_fcmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.ph.512 + return _mm512_mask_fcmul_pch(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_fcmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.ph.512 + return _mm512_maskz_fcmul_pch(__U, __A, __B); +} + +__m512h test_mm512_fcmul_round_pch(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_fcmul_round_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.ph.512 + return _mm512_fcmul_round_pch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_fcmul_round_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_fcmul_round_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.ph.512 + return _mm512_mask_fcmul_round_pch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_fcmul_round_pch(__mmask16 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_fcmul_round_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.ph.512 + return _mm512_maskz_fcmul_round_pch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + 
+// CFC ADD PH + +__m512h test_mm512_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fcmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512 + return _mm512_fcmadd_pch(__A, __B, __C); +} + +__m512h test_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fcmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512 + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + return _mm512_mask_fcmadd_pch(__A, __U, __B, __C); +} + +__m512h test_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm512_mask3_fcmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512 + // CHECK-NOT: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + return _mm512_mask3_fcmadd_pch(__A, __B, __C, __U); +} + +__m512h test_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fcmadd_pch + // CHECK: @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.512 + return _mm512_maskz_fcmadd_pch(__U, __A, __B, __C); +} + +__m512h test_mm512_fcmadd_round_pch(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fcmadd_round_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512 + return _mm512_fcmadd_round_pch(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_fcmadd_round_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fcmadd_round_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512 + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + return _mm512_mask_fcmadd_round_pch(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask3_fcmadd_round_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm512_mask3_fcmadd_round_pch 
+ // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512 + // CHECK-NOT: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + return _mm512_mask3_fcmadd_round_pch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_fcmadd_round_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fcmadd_round_pch + // CHECK: @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.512 + return _mm512_maskz_fcmadd_round_pch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +// CF MUL PH + +__m512h test_mm512_fmul_pch(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_fmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.ph.512 + return _mm512_fmul_pch(__A, __B); +} + +__m512h test_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_fmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.ph.512 + return _mm512_mask_fmul_pch(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_fmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.ph.512 + return _mm512_maskz_fmul_pch(__U, __A, __B); +} + +__m512h test_mm512_fmul_round_pch(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_fmul_round_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.ph.512 + return _mm512_fmul_round_pch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_fmul_round_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_fmul_round_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.ph.512 + return _mm512_mask_fmul_round_pch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_fmul_round_pch(__mmask16 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_fmul_round_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.ph.512 + return 
_mm512_maskz_fmul_round_pch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +// CF ADD PH + +__m512h test_mm512_fmadd_pch(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.ph.512 + return _mm512_fmadd_pch(__A, __B, __C); +} + +__m512h test_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.ph.512 + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + return _mm512_mask_fmadd_pch(__A, __U, __B, __C); +} + +__m512h test_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.ph.512 + // CHECK-NOT: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + return _mm512_mask3_fmadd_pch(__A, __B, __C, __U); +} + +__m512h test_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fmadd_pch + // CHECK: @llvm.x86.avx512fp16.maskz.vfmaddc.ph.512 + return _mm512_maskz_fmadd_pch(__U, __A, __B, __C); +} + +__m512h test_mm512_fmadd_round_pch(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fmadd_round_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.ph.512 + return _mm512_fmadd_round_pch(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_fmadd_round_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fmadd_round_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.ph.512 + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + return _mm512_mask_fmadd_round_pch(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask3_fmadd_round_pch(__m512h __A, __m512h __B, __m512h __C, 
__mmask16 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmadd_round_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.ph.512 + // CHECK-NOT: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + return _mm512_mask3_fmadd_round_pch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_fmadd_round_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fmadd_round_pch + // CHECK: @llvm.x86.avx512fp16.maskz.vfmaddc.ph.512 + return _mm512_maskz_fmadd_round_pch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +// CF MUL SH + +__m128h test_mm_fmul_sch(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_fmul_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.sh + return _mm_fmul_sch(__A, __B); +} + +__m128h test_mm_mask_fmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fmul_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.sh + return _mm_mask_fmul_sch(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_fmul_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.sh + return _mm_maskz_fmul_sch(__U, __A, __B); +} + +__m128h test_mm_fmul_round_sch(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_fmul_round_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.sh + return _mm_fmul_round_sch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask_fmul_round_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fmul_round_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.sh + return _mm_mask_fmul_round_sch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_fmul_round_sch(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_fmul_round_sch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.sh + return _mm_maskz_fmul_round_sch(__U, 
__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + _Float16 test_mm512_reduce_add_ph(__m512h __W) { // CHECK-LABEL: @test_mm512_reduce_add_ph // CHECK: %{{.*}} = shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c --- a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c @@ -383,6 +383,92 @@ return _mm256_abs_ph(a); } +__m256h test_mm256_conj_pch(__m256h __A) { + // CHECK-LABEL: @test_mm256_conj_pch + // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float> + // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32> + // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32> + // CHECK: %{{.*}} = xor <8 x i32> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = bitcast <8 x i32> %{{.*}} to <8 x float> + // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half> + return _mm256_conj_pch(__A); +} + +__m256h test_mm256_mask_conj_pch(__m256h __W, __mmask32 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_mask_conj_pch + // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8 + // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float> + // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32> + // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32> + // CHECK: %{{.*}} = xor <8 x i32> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = bitcast <8 x i32> %{{.*}} to <8 x float> + // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half> + // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float> + // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} + // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half> + return _mm256_mask_conj_pch(__W, __U, __A); +} + +__m256h test_mm256_maskz_conj_pch(__mmask32 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_maskz_conj_pch + // CHECK: %{{.*}} = trunc i32 %{{.*}} 
to i8 + // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float> + // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32> + // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32> + // CHECK: %{{.*}} = xor <8 x i32> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = bitcast <8 x i32> %{{.*}} to <8 x float> + // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half> + // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} + // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half> + return _mm256_maskz_conj_pch(__U, __A); +} + +__m128h test_mm_conj_pch(__m128h __A) { + // CHECK-LABEL: @test_mm_conj_pch + // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float> + // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32> + // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32> + // CHECK: %{{.*}} = xor <4 x i32> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = bitcast <4 x i32> %{{.*}} to <4 x float> + // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half> + return _mm_conj_pch(__A); +} + +__m128h test_mm_mask_conj_pch(__m128h __W, __mmask32 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_mask_conj_pch + // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8 + // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float> + // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32> + // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32> + // CHECK: %{{.*}} = xor <4 x i32> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = bitcast <4 x i32> %{{.*}} to <4 x float> + // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half> + // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float> + // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} + // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half> + 
return _mm_mask_conj_pch(__W, __U, __A); +} + +__m128h test_mm_maskz_conj_pch(__mmask32 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_maskz_conj_pch + // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8 + // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float> + // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32> + // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32> + // CHECK: %{{.*}} = xor <4 x i32> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = bitcast <4 x i32> %{{.*}} to <4 x float> + // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half> + // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} + // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half> + return _mm_maskz_conj_pch(__U, __A); +} + __mmask16 test_mm256_cmp_ph_mask_eq_oq(__m256h a, __m256h b) { // CHECK-LABEL: @test_mm256_cmp_ph_mask_eq_oq // CHECK: fcmp oeq <16 x half> %{{.*}}, %{{.*}} @@ -2726,6 +2812,183 @@ // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) return _mm256_mask3_fnmsub_ph(__A, __B, __C, __U); } + +__m128h test_mm_fcmul_pch(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_fcmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.ph.128 + return _mm_fcmul_pch(__A, __B); +} + +__m128h test_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fcmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.ph.128 + return _mm_mask_fcmul_pch(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_fcmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.ph.128 + return _mm_maskz_fcmul_pch(__U, __A, __B); +} + +__m256h test_mm256_fcmul_pch(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_fcmul_pch + // CHECK: 
@llvm.x86.avx512fp16.mask.vfcmulc.ph.256 + return _mm256_fcmul_pch(__A, __B); +} + +__m256h test_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_fcmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.ph.256 + return _mm256_mask_fcmul_pch(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_fcmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmulc.ph.256 + return _mm256_maskz_fcmul_pch(__U, __A, __B); +} + +__m128h test_mm_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fcmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.ph.128 + return _mm_fcmadd_pch(__A, __B, __C); +} + +__m128h test_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_fcmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.ph.128 + // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} + return _mm_mask_fcmadd_pch(__A, __U, __B, __C); +} + +__m128h test_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fcmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.ph.128 + // CHECK-NOT: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} + return _mm_mask3_fcmadd_pch(__A, __B, __C, __U); +} + +__m128h test_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fcmadd_pch + // CHECK: @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.128 + return _mm_maskz_fcmadd_pch(__U, __A, __B, __C); +} + +__m256h test_mm256_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_fcmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.ph.256 + return _mm256_fcmadd_pch(__A, __B, __C); +} + +__m256h 
test_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_fcmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.ph.256 + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} + return _mm256_mask_fcmadd_pch(__A, __U, __B, __C); +} + +__m256h test_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm256_mask3_fcmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfcmaddc.ph.256 + // CHECK-NOT: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} + return _mm256_mask3_fcmadd_pch(__A, __B, __C, __U); +} + +__m256h test_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_maskz_fcmadd_pch + // CHECK: @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.256 + return _mm256_maskz_fcmadd_pch(__U, __A, __B, __C); +} + +__m128h test_mm_fmul_pch(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_fmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.ph.128 + return _mm_fmul_pch(__A, __B); +} + +__m128h test_mm_mask_fmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.ph.128 + return _mm_mask_fmul_pch(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_fmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.ph.128 + return _mm_maskz_fmul_pch(__U, __A, __B); +} + +__m256h test_mm256_fmul_pch(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_fmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.ph.256 + return _mm256_fmul_pch(__A, __B); +} + +__m256h test_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_fmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.ph.256 + return _mm256_mask_fmul_pch(__W, __U, __A, __B); +} + +__m256h 
test_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_fmul_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmulc.ph.256 + return _mm256_maskz_fmul_pch(__U, __A, __B); +} + +__m128h test_mm_fmadd_pch(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.ph.128 + return _mm_fmadd_pch(__A, __B, __C); +} + +__m128h test_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_fmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.ph.128 + // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} + return _mm_mask_fmadd_pch(__A, __U, __B, __C); +} + +__m128h test_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.ph.128 + return _mm_mask3_fmadd_pch(__A, __B, __C, __U); +} + +__m128h test_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fmadd_pch + // CHECK: @llvm.x86.avx512fp16.maskz.vfmaddc.ph.128 + return _mm_maskz_fmadd_pch(__U, __A, __B, __C); +} + +__m256h test_mm256_fmadd_pch(__m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_fmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.ph.256 + return _mm256_fmadd_pch(__A, __B, __C); +} + +__m256h test_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_fmadd_pch + // CHECK: @llvm.x86.avx512fp16.mask.vfmaddc.ph.256 + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} + return _mm256_mask_fmadd_pch(__A, __U, __B, __C); +} + +__m256h test_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm256_mask3_fmadd_pch + // CHECK: 
@llvm.x86.avx512fp16.mask.vfmaddc.ph.256 + return _mm256_mask3_fmadd_pch(__A, __B, __C, __U); +} + +__m256h test_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_maskz_fmadd_pch + // CHECK: @llvm.x86.avx512fp16.maskz.vfmaddc.ph.256 + return _mm256_maskz_fmadd_pch(__U, __A, __B, __C); +} + __m128h test_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { // CHECK-LABEL: @test_mm_mask_blend_ph // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5728,4 +5728,137 @@ : Intrinsic<[ llvm_half_ty ], [ llvm_half_ty, llvm_half_ty, llvm_half_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg<ArgIndex<3>> ]>; + + def int_x86_avx512fp16_mask_vfcmaddc_ph_128 + : GCCBuiltin<"__builtin_ia32_vfcmaddcph128_mask">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_maskz_vfcmaddc_ph_128 + : GCCBuiltin<"__builtin_ia32_vfcmaddcph128_maskz">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vfcmaddc_ph_256 + : GCCBuiltin<"__builtin_ia32_vfcmaddcph256_mask">, + Intrinsic<[ llvm_v8f32_ty ], + [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_maskz_vfcmaddc_ph_256 + : GCCBuiltin<"__builtin_ia32_vfcmaddcph256_maskz">, + Intrinsic<[ llvm_v8f32_ty ], + [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vfcmaddc_ph_512 + : GCCBuiltin<"__builtin_ia32_vfcmaddcph512_mask">, + Intrinsic<[ llvm_v16f32_ty ], + [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg<ArgIndex<4>> ]>; + def int_x86_avx512fp16_maskz_vfcmaddc_ph_512 + :
GCCBuiltin<"__builtin_ia32_vfcmaddcph512_maskz">, + Intrinsic<[ llvm_v16f32_ty ], + [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg<ArgIndex<4>> ]>; + def int_x86_avx512fp16_mask_vfmaddc_ph_128 + : GCCBuiltin<"__builtin_ia32_vfmaddcph128_mask">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_maskz_vfmaddc_ph_128 + : GCCBuiltin<"__builtin_ia32_vfmaddcph128_maskz">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vfmaddc_ph_256 + : GCCBuiltin<"__builtin_ia32_vfmaddcph256_mask">, + Intrinsic<[ llvm_v8f32_ty ], + [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_maskz_vfmaddc_ph_256 + : GCCBuiltin<"__builtin_ia32_vfmaddcph256_maskz">, + Intrinsic<[ llvm_v8f32_ty ], + [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vfmaddc_ph_512 + : GCCBuiltin<"__builtin_ia32_vfmaddcph512_mask">, + Intrinsic<[ llvm_v16f32_ty ], + [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg<ArgIndex<4>> ]>; + def int_x86_avx512fp16_maskz_vfmaddc_ph_512 + : GCCBuiltin<"__builtin_ia32_vfmaddcph512_maskz">, + Intrinsic<[ llvm_v16f32_ty ], + [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg<ArgIndex<4>> ]>; + def int_x86_avx512fp16_mask_vfmaddc_sh + : GCCBuiltin<"__builtin_ia32_vfmaddcsh_mask">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg<ArgIndex<4>> ]>; + def int_x86_avx512fp16_maskz_vfmaddc_sh + : GCCBuiltin<"__builtin_ia32_vfmaddcsh_maskz">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg<ArgIndex<4>> ]>; + def
int_x86_avx512fp16_mask_vfcmaddc_sh + : GCCBuiltin<"__builtin_ia32_vfcmaddcsh_mask">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg<ArgIndex<4>> ]>; + def int_x86_avx512fp16_maskz_vfcmaddc_sh + : GCCBuiltin<"__builtin_ia32_vfcmaddcsh_maskz">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg<ArgIndex<4>> ]>; + def int_x86_avx512fp16_mask_vfmulc_ph_128 + : GCCBuiltin<"__builtin_ia32_vfmulcph128_mask">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vfcmulc_ph_128 + : GCCBuiltin<"__builtin_ia32_vfcmulcph128_mask">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vfmulc_ph_256 + : GCCBuiltin<"__builtin_ia32_vfmulcph256_mask">, + Intrinsic<[ llvm_v8f32_ty ], + [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vfcmulc_ph_256 + : GCCBuiltin<"__builtin_ia32_vfcmulcph256_mask">, + Intrinsic<[ llvm_v8f32_ty ], + [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vfmulc_ph_512 + : GCCBuiltin<"__builtin_ia32_vfmulcph512_mask">, + Intrinsic<[ llvm_v16f32_ty ], + [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg<ArgIndex<4>> ]>; + def int_x86_avx512fp16_mask_vfcmulc_ph_512 + : GCCBuiltin<"__builtin_ia32_vfcmulcph512_mask">, + Intrinsic<[ llvm_v16f32_ty ], + [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg<ArgIndex<4>> ]>; + def int_x86_avx512fp16_mask_vfmulc_sh + : GCCBuiltin<"__builtin_ia32_vfmulcsh_mask">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg<ArgIndex<4>> ]>; + def
int_x86_avx512fp16_mask_vfcmulc_sh + : GCCBuiltin<"__builtin_ia32_vfcmulcsh_mask">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg<ArgIndex<4>> ]>; } diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3865,6 +3865,176 @@ } break; } + case X86::VFCMADDCPHZ128m: + case X86::VFCMADDCPHZ256m: + case X86::VFCMADDCPHZm: + case X86::VFCMADDCPHZ128mb: + case X86::VFCMADDCPHZ256mb: + case X86::VFCMADDCPHZmb: + case X86::VFCMADDCPHZ128mbk: + case X86::VFCMADDCPHZ256mbk: + case X86::VFCMADDCPHZmbk: + case X86::VFCMADDCPHZ128mbkz: + case X86::VFCMADDCPHZ256mbkz: + case X86::VFCMADDCPHZmbkz: + case X86::VFCMADDCPHZ128mk: + case X86::VFCMADDCPHZ256mk: + case X86::VFCMADDCPHZmk: + case X86::VFCMADDCPHZ128mkz: + case X86::VFCMADDCPHZ256mkz: + case X86::VFCMADDCPHZmkz: + case X86::VFCMADDCPHZ128r: + case X86::VFCMADDCPHZ256r: + case X86::VFCMADDCPHZr: + case X86::VFCMADDCPHZ128rk: + case X86::VFCMADDCPHZ256rk: + case X86::VFCMADDCPHZrk: + case X86::VFCMADDCPHZ128rkz: + case X86::VFCMADDCPHZ256rkz: + case X86::VFCMADDCPHZrkz: + case X86::VFCMADDCPHZrb: + case X86::VFCMADDCPHZrbk: + case X86::VFCMADDCPHZrbkz: + case X86::VFCMADDCSHZm: + case X86::VFCMADDCSHZmk: + case X86::VFCMADDCSHZmkz: + case X86::VFCMADDCSHZr: + case X86::VFCMADDCSHZrb: + case X86::VFCMADDCSHZrbk: + case X86::VFCMADDCSHZrbkz: + case X86::VFCMADDCSHZrk: + case X86::VFCMADDCSHZrkz: + case X86::VFMADDCPHZ128m: + case X86::VFMADDCPHZ256m: + case X86::VFMADDCPHZm: + case X86::VFMADDCPHZ128mb: + case X86::VFMADDCPHZ256mb: + case X86::VFMADDCPHZmb: + case X86::VFMADDCPHZ128mbk: + case X86::VFMADDCPHZ256mbk: + case X86::VFMADDCPHZmbk: + case X86::VFMADDCPHZ128mbkz: + case X86::VFMADDCPHZ256mbkz: + case X86::VFMADDCPHZmbkz: + case X86::VFMADDCPHZ128mk: + case X86::VFMADDCPHZ256mk: +
case X86::VFMADDCPHZmk: + case X86::VFMADDCPHZ128mkz: + case X86::VFMADDCPHZ256mkz: + case X86::VFMADDCPHZmkz: + case X86::VFMADDCPHZ128r: + case X86::VFMADDCPHZ256r: + case X86::VFMADDCPHZr: + case X86::VFMADDCPHZ128rk: + case X86::VFMADDCPHZ256rk: + case X86::VFMADDCPHZrk: + case X86::VFMADDCPHZ128rkz: + case X86::VFMADDCPHZ256rkz: + case X86::VFMADDCPHZrkz: + case X86::VFMADDCPHZrb: + case X86::VFMADDCPHZrbk: + case X86::VFMADDCPHZrbkz: + case X86::VFMADDCSHZm: + case X86::VFMADDCSHZmk: + case X86::VFMADDCSHZmkz: + case X86::VFMADDCSHZr: + case X86::VFMADDCSHZrb: + case X86::VFMADDCSHZrbk: + case X86::VFMADDCSHZrbkz: + case X86::VFMADDCSHZrk: + case X86::VFMADDCSHZrkz: { + unsigned Dest = Inst.getOperand(0).getReg(); + for (unsigned i = 2; i < Inst.getNumOperands(); i++) + if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg()) + return Warning(Ops[0]->getStartLoc(), "Destination register should be " + "distinct from source registers"); + break; + } + case X86::VFCMULCPHZ128rm: + case X86::VFCMULCPHZ256rm: + case X86::VFCMULCPHZrm: + case X86::VFCMULCPHZ128rmb: + case X86::VFCMULCPHZ256rmb: + case X86::VFCMULCPHZrmb: + case X86::VFCMULCPHZ128rmbk: + case X86::VFCMULCPHZ256rmbk: + case X86::VFCMULCPHZrmbk: + case X86::VFCMULCPHZ128rmbkz: + case X86::VFCMULCPHZ256rmbkz: + case X86::VFCMULCPHZrmbkz: + case X86::VFCMULCPHZ128rmk: + case X86::VFCMULCPHZ256rmk: + case X86::VFCMULCPHZrmk: + case X86::VFCMULCPHZ128rmkz: + case X86::VFCMULCPHZ256rmkz: + case X86::VFCMULCPHZrmkz: + case X86::VFCMULCPHZ128rr: + case X86::VFCMULCPHZ256rr: + case X86::VFCMULCPHZrr: + case X86::VFCMULCPHZ128rrk: + case X86::VFCMULCPHZ256rrk: + case X86::VFCMULCPHZrrk: + case X86::VFCMULCPHZ128rrkz: + case X86::VFCMULCPHZ256rrkz: + case X86::VFCMULCPHZrrkz: + case X86::VFCMULCPHZrrb: + case X86::VFCMULCPHZrrbk: + case X86::VFCMULCPHZrrbkz: + case X86::VFCMULCSHZrm: + case X86::VFCMULCSHZrmk: + case X86::VFCMULCSHZrmkz: + case X86::VFCMULCSHZrr: + case X86::VFCMULCSHZrrb: + case 
X86::VFCMULCSHZrrbk: + case X86::VFCMULCSHZrrbkz: + case X86::VFCMULCSHZrrk: + case X86::VFCMULCSHZrrkz: + case X86::VFMULCPHZ128rm: + case X86::VFMULCPHZ256rm: + case X86::VFMULCPHZrm: + case X86::VFMULCPHZ128rmb: + case X86::VFMULCPHZ256rmb: + case X86::VFMULCPHZrmb: + case X86::VFMULCPHZ128rmbk: + case X86::VFMULCPHZ256rmbk: + case X86::VFMULCPHZrmbk: + case X86::VFMULCPHZ128rmbkz: + case X86::VFMULCPHZ256rmbkz: + case X86::VFMULCPHZrmbkz: + case X86::VFMULCPHZ128rmk: + case X86::VFMULCPHZ256rmk: + case X86::VFMULCPHZrmk: + case X86::VFMULCPHZ128rmkz: + case X86::VFMULCPHZ256rmkz: + case X86::VFMULCPHZrmkz: + case X86::VFMULCPHZ128rr: + case X86::VFMULCPHZ256rr: + case X86::VFMULCPHZrr: + case X86::VFMULCPHZ128rrk: + case X86::VFMULCPHZ256rrk: + case X86::VFMULCPHZrrk: + case X86::VFMULCPHZ128rrkz: + case X86::VFMULCPHZ256rrkz: + case X86::VFMULCPHZrrkz: + case X86::VFMULCPHZrrb: + case X86::VFMULCPHZrrbk: + case X86::VFMULCPHZrrbkz: + case X86::VFMULCSHZrm: + case X86::VFMULCSHZrmk: + case X86::VFMULCSHZrmkz: + case X86::VFMULCSHZrr: + case X86::VFMULCSHZrrb: + case X86::VFMULCSHZrrbk: + case X86::VFMULCSHZrrbkz: + case X86::VFMULCSHZrrk: + case X86::VFMULCSHZrrkz: { + unsigned Dest = Inst.getOperand(0).getReg(); + for (unsigned i = 1; i < Inst.getNumOperands(); i++) + if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg()) + return Warning(Ops[0]->getStartLoc(), "Destination register should be " + "distinct from source registers"); + break; + } } const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -569,6 +569,27 @@ FMADDSUB_RND, FMSUBADD_RND, + // AVX-512-FP16 complex addition and multiplication + VFMADDC, + VFMADDC_RND, + VFCMADDC, + VFCMADDC_RND, + + VFMULC, + VFMULC_RND, + VFCMULC, + VFCMULC_RND, + + VFMADDCSH, + VFMADDCSH_RND, + VFCMADDCSH, + VFCMADDCSH_RND, + + 
VFMULCSH, + VFMULCSH_RND, + VFCMULCSH, + VFCMULCSH_RND, + // Compress and expand. COMPRESS, EXPAND, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25741,6 +25741,35 @@ // Swap Src1 and Src2 in the node creation return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1); } + case FMA_OP_MASKZ: + case FMA_OP_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + MVT VT = Op.getSimpleValueType(); + + SDValue PassThru = Src1; + if (IntrData->Type == FMA_OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + + // We add rounding mode to the Node when + // - RC Opcode is specified and + // - RC is not "current direction". + SDValue NewOp; + if (IntrData->Opc1 != 0) { + SDValue Rnd = Op.getOperand(5); + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3, + DAG.getTargetConstant(RC, dl, MVT::i32)); + else if (!isRoundModeCurDirection(Rnd)) + return SDValue(); + } + if (!NewOp) + NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3); + return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG); + } case IFMA_OP: // NOTE: We need to swizzle the operands to pass the multiply operands // first. 
@@ -31970,6 +31999,22 @@
   NODE_NAME_CASE(FNMSUB_RND)
   NODE_NAME_CASE(FMADDSUB_RND)
   NODE_NAME_CASE(FMSUBADD_RND)
+  NODE_NAME_CASE(VFMADDC)
+  NODE_NAME_CASE(VFMADDC_RND)
+  NODE_NAME_CASE(VFCMADDC)
+  NODE_NAME_CASE(VFCMADDC_RND)
+  NODE_NAME_CASE(VFMULC)
+  NODE_NAME_CASE(VFMULC_RND)
+  NODE_NAME_CASE(VFCMULC)
+  NODE_NAME_CASE(VFCMULC_RND)
+  NODE_NAME_CASE(VFMULCSH)
+  NODE_NAME_CASE(VFMULCSH_RND)
+  NODE_NAME_CASE(VFCMULCSH)
+  NODE_NAME_CASE(VFCMULCSH_RND)
+  NODE_NAME_CASE(VFMADDCSH)
+  NODE_NAME_CASE(VFMADDCSH_RND)
+  NODE_NAME_CASE(VFCMADDCSH)
+  NODE_NAME_CASE(VFCMADDCSH_RND)
   NODE_NAME_CASE(VPMADD52H)
   NODE_NAME_CASE(VPMADD52L)
   NODE_NAME_CASE(VRNDSCALE)
@@ -46855,7 +46900,7 @@
 
   switch (Opcode) {
   case ISD::FADD:
-  case ISD::FSUB:
+  case ISD::FSUB: {
     if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
       SDValue LHS = N->getOperand(0);
@@ -46870,7 +46915,55 @@
         return HorizBinOp;
       }
     }
+    // Try to combine the following nodes
+    // t21: v16f32 = X86ISD::VFMULC/VFCMULC t7, t8
+    // t15: v32f16 = bitcast t21
+    // t16: v32f16 = fadd nnan ninf nsz arcp contract afn reassoc t15, t2
+    // into X86ISD::VFMADDC/VFCMADDC if possible:
+    // t22: v16f32 = bitcast t2
+    // t23: v16f32 = nnan ninf nsz arcp contract afn reassoc
+    //               X86ISD::VFMADDC/VFCMADDC t7, t8, t22
+    // t24: v32f16 = bitcast t23
+    SDValue LHS = N->getOperand(0);
+    SDValue RHS = N->getOperand(1);
+    auto getMulId = [&]() {
+      if (LHS->getOpcode() == ISD::BITCAST && LHS.hasOneUse() &&
+          (LHS->getOperand(0)->getOpcode() == X86ISD::VFMULC ||
+           LHS->getOperand(0)->getOpcode() == X86ISD::VFCMULC) &&
+          LHS->getOperand(0).hasOneUse())
+        return 0;
+      if (RHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse() &&
+          (RHS->getOperand(0)->getOpcode() == X86ISD::VFMULC ||
+           RHS->getOperand(0)->getOpcode() == X86ISD::VFCMULC) &&
+          RHS->getOperand(0).hasOneUse())
+        return 1;
+      return 2;
+    };
+    int MulId = getMulId();
+    const TargetOptions &Options = DAG.getTarget().Options;
+    if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
+        MulId < 2 && Subtarget.hasFP16() && IsAdd &&
+        (VT == MVT::v32f16 || VT == MVT::v16f16 || VT == MVT::v8f16)) {
+      SDValue FAddOp1 = N->getOperand(1 - MulId);
+      SDValue MULC = N->getOperand(MulId)->getOperand(0);
+      MVT ComplexType =
+          MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
+      if ((MULC->getOpcode() == X86ISD::VFMULC ||
+           MULC->getOpcode() == X86ISD::VFCMULC) &&
+          MULC.hasOneUse() && MULC->getValueType(0) == ComplexType) {
+        SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+        FAddOp1 = DAG.getBitcast(ComplexType, FAddOp1);
+        SDValue FMAddC =
+            DAG.getNode(MULC->getOpcode() == X86ISD::VFMULC ? X86ISD::VFMADDC
+                                                            : X86ISD::VFCMADDC,
+                        SDLoc(N), ComplexType, FAddOp1, MULC.getOperand(0),
+                        MULC.getOperand(1));
+        SDValue Res = DAG.getBitcast(VT, FMAddC);
+        return Res;
+      }
+    }
     break;
+  }
   case ISD::ADD:
   case ISD::SUB:
     if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
@@ -46897,6 +46990,77 @@
 
   return SDValue();
 }
+// Try to combine the following nodes
+//  t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
+//    <i32 -2147483648[float -0.000000e+00]> 0
+//  t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
+//    <(load 4 from constant-pool)> t0, t29
+//  [t30: v16i32 = bitcast t27]
+//  t6: v16i32 = xor t7, t27[t30]
+//  t11: v16f32 = bitcast t6
+//  t21: v16f32 = X86ISD::VFMULC[X86ISD::VFCMULC] t11, t8
+// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
+//  t22: v16f32 = bitcast t7
+//  t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC]
+//    t8, t22
+//  t24: v32f16 = bitcast t23
+static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  int CombineOpcode =
+      N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
+  auto isConjugationConstant = [](const Constant *c) {
+    if (const auto *CI = dyn_cast<ConstantInt>(c)) {
+      APInt ConjugationInt32 = APInt(32, 0x80000000, true);
+      APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
+      switch (CI->getBitWidth()) {
+      case 16:
+        return false;
+      case 32:
+        return CI->getValue() == ConjugationInt32;
+      case 64:
+        return CI->getValue() == ConjugationInt64;
+      default:
+        llvm_unreachable("Unexpected bit width");
+      }
+    }
+    if (const auto *CF = dyn_cast<ConstantFP>(c))
+      return CF->isNegativeZeroValue();
+    return false;
+  };
+  auto combineConjugation = [&](SDValue &r) {
+    if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
+      SDValue XOR = LHS.getOperand(0);
+      if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
+        SDValue XORRHS = XOR.getOperand(1);
+        if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
+          XORRHS = XORRHS.getOperand(0);
+        if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
+            XORRHS.getOperand(1).getNumOperands()) {
+          ConstantPoolSDNode *CP =
+              dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
+          if (CP && isConjugationConstant(CP->getConstVal())) {
+            SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+            SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
+            SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
+            r = DAG.getBitcast(VT, FCMulC);
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  };
+  SDValue Res;
+  if (combineConjugation(Res))
+    return Res;
+  std::swap(LHS, RHS);
+  if (combineConjugation(Res))
+    return Res;
+  return Res;
+}
 
 /// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, @@ -51426,6 +51590,8 @@ return combineUIntToFP(N, DAG, Subtarget); case ISD::FADD: case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); + case X86ISD::VFCMULC: + case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -209,8 +209,9 @@ string MaskingConstraint = "", bit IsCommutable = 0, bit IsKCommutable = 0, - bit IsKZCommutable = IsCommutable> { - let isCommutable = IsCommutable in + bit IsKZCommutable = IsCommutable, + string ClobberConstraint = ""> { + let isCommutable = IsCommutable, Constraints = ClobberConstraint in def NAME: AVX512, EVEX_K { // In case of the 3src subclass this is overridden with a let. - string Constraints = MaskingConstraint; + string Constraints = !if(!eq(ClobberConstraint, ""), MaskingConstraint, + !if(!eq(MaskingConstraint, ""), ClobberConstraint, + !strconcat(ClobberConstraint, ", ", MaskingConstraint))); } // Zero mask does not add any restrictions to commute operands transformation. // So, it is Ok to use IsCommutable instead of IsKCommutable. - let isCommutable = IsKZCommutable in // Prefer over VMOV*rrkz Pat<> + let isCommutable = IsKZCommutable, // Prefer over VMOV*rrkz Pat<> + Constraints = ClobberConstraint in def NAME#kz: AVX512 : + bit IsKZCommutable = IsCommutable, + string ClobberConstraint = ""> : AVX512_maskable_custom; + IsKCommutable, IsKZCommutable, ClobberConstraint>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. 
In the masking case, the @@ -268,7 +273,8 @@ string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskRHS, bit IsCommutable = 0, bit IsKCommutable = 0, - bit IsKZCommutable = IsCommutable> : + bit IsKZCommutable = IsCommutable, + string ClobberConstraint = "" > : AVX512_maskable_custom; + IsKZCommutable, ClobberConstraint>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. In the masking case, the @@ -290,14 +296,15 @@ dag RHS, bit IsCommutable = 0, bit IsKCommutable = 0, bit IsKZCommutable = IsCommutable, - SDPatternOperator Select = vselect_mask> : + SDPatternOperator Select = vselect_mask, + string ClobberConstraint = ""> : AVX512_maskable_common; + IsKZCommutable, ClobberConstraint>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the scalar instruction. @@ -5749,29 +5756,34 @@ SDPatternOperator MaskOpNode, X86VectorVTInfo _, X86FoldableSchedWrite sched, bit IsCommutable, - bit IsKCommutable = IsCommutable> { + bit IsKCommutable = IsCommutable, + string suffix = _.Suffix, + string ClobberConstraint = "", + bit MayRaiseFPException = 1> { let ExeDomain = _.ExeDomain, hasSideEffects = 0, - Uses = [MXCSR], mayRaiseFPException = 1 in { + Uses = [MXCSR], mayRaiseFPException = MayRaiseFPException in { defm rr: AVX512_maskable_split, + IsKCommutable, IsKCommutable, ClobberConstraint>, EVEX_4V, Sched<[sched]>; let mayLoad = 1 in { defm rm: AVX512_maskable_split, + (MaskOpNode _.RC:$src1, (_.LdFrag addr:$src2)), + 0, 0, 0, ClobberConstraint>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmb: AVX512_maskable_split, + (MaskOpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))), + 0, 0, 0, ClobberConstraint>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5780,12 +5792,15 @@ multiclass avx512_fp_round_packed opc, string OpcodeStr, SDPatternOperator OpNodeRnd, - X86FoldableSchedWrite sched, 
X86VectorVTInfo _> { + X86FoldableSchedWrite sched, X86VectorVTInfo _, + string suffix = _.Suffix, + string ClobberConstraint = ""> { let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrb: AVX512_maskable, + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc))), + 0, 0, 0, vselect_mask, ClobberConstraint>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>; } @@ -13509,3 +13524,135 @@ defm VRSQRT : avx512_fp16_p_vl_all<0x4E, "vrsqrtph", X86frsqrt, SchedWriteFRsqrt>; defm VRCP : avx512_fp16_p_vl_all<0x4C, "vrcpph", X86frcp, SchedWriteFRcp>; + +let Constraints = "@earlyclobber $dst, $src1 = $dst" in { + multiclass avx512_cfmop_rm opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + defm r: AVX512_maskable_3src, EVEX_4V; + + defm m: AVX512_maskable_3src, EVEX_4V; + + defm mb: AVX512_maskable_3src, EVEX_B, EVEX_4V; + } +} // Constraints = "@earlyclobber $dst, $src1 = $dst" + +multiclass avx512_cfmop_round opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let Constraints = "@earlyclobber $dst, $src1 = $dst" in + defm rb: AVX512_maskable_3src, + EVEX_4V, EVEX_B, EVEX_RC; +} + + +multiclass avx512_cfmop_common opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasFP16] in { + defm Z : avx512_cfmop_rm, + avx512_cfmop_round, + EVEX_V512; + } + let Predicates = [HasVLX, HasFP16] in { + defm Z256 : avx512_cfmop_rm, EVEX_V256; + defm Z128 : avx512_cfmop_rm, EVEX_V128; + } +} + +multiclass avx512_cfmbinop_common opc, string OpcodeStr, SDNode OpNode, + SDNode MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched = SchedWriteFMA> { + let Predicates = [HasFP16] in { + defm Z : avx512_fp_packed, + avx512_fp_round_packed, EVEX_V512; + } + let Predicates = [HasVLX, HasFP16] in { + defm Z256 : avx512_fp_packed, + EVEX_V256; + defm Z128 : avx512_fp_packed, + EVEX_V128; + } +} + + +let Uses = [MXCSR] in { + defm VFMADDCPH : avx512_cfmop_common<0x56, "vfmaddcph", x86vfmaddc, x86vfmaddcRnd>, + T_MAP6XS, EVEX_CD8<32, CD8VF>; + defm VFCMADDCPH 
: avx512_cfmop_common<0x56, "vfcmaddcph", x86vfcmaddc, x86vfcmaddcRnd>, + T_MAP6XD, EVEX_CD8<32, CD8VF>; + + defm VFMULCPH : avx512_cfmbinop_common<0xD6, "vfmulcph", x86vfmulc, x86vfmulc, + x86vfmulcRnd>, T_MAP6XS, EVEX_CD8<32, CD8VF>; + defm VFCMULCPH : avx512_cfmbinop_common<0xD6, "vfcmulcph", x86vfcmulc, + x86vfcmulc, x86vfcmulcRnd>, + T_MAP6XD, EVEX_CD8<32, CD8VF>; +} + + +multiclass avx512_cfmop_sh_common opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched = SchedWriteFMA> { + let Predicates = [HasFP16], Constraints = "@earlyclobber $dst, $src1 = $dst" in { + defm r : AVX512_maskable_3src, + Sched<[sched.XMM]>; + defm m : AVX512_maskable_3src, + Sched<[sched.XMM.Folded, sched.XMM.ReadAfterFold]>; + defm rb : AVX512_maskable_3src, + EVEX_B, EVEX_RC, Sched<[sched.XMM]>; + } +} + +multiclass avx512_cfmbinop_sh_common opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched = SchedWriteFMA> { + let Predicates = [HasFP16] in { + defm rr : AVX512_maskable, Sched<[sched.XMM]>; + defm rm : AVX512_maskable, + Sched<[sched.XMM.Folded, sched.XMM.ReadAfterFold]>; + defm rrb : AVX512_maskable, + EVEX_B, EVEX_RC, Sched<[sched.XMM]>; + } +} + +let Uses = [MXCSR] in { + defm VFMADDCSHZ : avx512_cfmop_sh_common<0x57, "vfmaddcsh", x86vfmaddcSh, x86vfmaddcShRnd>, + T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V; + defm VFCMADDCSHZ : avx512_cfmop_sh_common<0x57, "vfcmaddcsh", x86vfcmaddcSh, x86vfcmaddcShRnd>, + T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V; + + defm VFMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfmulcsh", x86vfmulcSh, x86vfmulcShRnd>, + T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V; + defm VFCMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfcmulcsh", x86vfcmulcSh, x86vfcmulcShRnd>, + T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V; +} diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp --- 
a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -1846,6 +1846,10 @@ { X86::VEXPANDPSZ128rrkz, X86::VEXPANDPSZ128rmkz, TB_NO_REVERSE }, { X86::VEXPANDPSZ256rrkz, X86::VEXPANDPSZ256rmkz, TB_NO_REVERSE }, { X86::VEXPANDPSZrrkz, X86::VEXPANDPSZrmkz, TB_NO_REVERSE }, + { X86::VFCMULCPHZ128rr, X86::VFCMULCPHZ128rm, 0 }, + { X86::VFCMULCPHZ256rr, X86::VFCMULCPHZ256rm, 0 }, + { X86::VFCMULCPHZrr, X86::VFCMULCPHZrm, 0 }, + { X86::VFCMULCSHZrr, X86::VFCMULCSHZrm, TB_NO_REVERSE }, { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, 0 }, { X86::VFMADDPD4rr, X86::VFMADDPD4mr, 0 }, { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, 0 }, @@ -1870,6 +1874,10 @@ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE }, { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 }, { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE }, + { X86::VFMULCPHZ128rr, X86::VFMULCPHZ128rm, 0 }, + { X86::VFMULCPHZ256rr, X86::VFMULCPHZ256rm, 0 }, + { X86::VFMULCPHZrr, X86::VFMULCPHZrm, 0 }, + { X86::VFMULCSHZrr, X86::VFMULCSHZrm, TB_NO_REVERSE }, { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, 0 }, { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, 0 }, { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, 0 }, @@ -3275,6 +3283,14 @@ { X86::VEXPANDPSZ128rrk, X86::VEXPANDPSZ128rmk, TB_NO_REVERSE }, { X86::VEXPANDPSZ256rrk, X86::VEXPANDPSZ256rmk, TB_NO_REVERSE }, { X86::VEXPANDPSZrrk, X86::VEXPANDPSZrmk, TB_NO_REVERSE }, + { X86::VFCMADDCPHZ128r, X86::VFCMADDCPHZ128m, 0 }, + { X86::VFCMADDCPHZ256r, X86::VFCMADDCPHZ256m, 0 }, + { X86::VFCMADDCPHZr, X86::VFCMADDCPHZm, 0 }, + { X86::VFCMADDCSHZr, X86::VFCMADDCSHZm, TB_NO_REVERSE }, + { X86::VFCMULCPHZ128rrkz, X86::VFCMULCPHZ128rmkz, 0 }, + { X86::VFCMULCPHZ256rrkz, X86::VFCMULCPHZ256rmkz, 0 }, + { X86::VFCMULCPHZrrkz, X86::VFCMULCPHZrmkz, 0 }, + { X86::VFCMULCSHZrrkz, X86::VFCMULCSHZrmkz, TB_NO_REVERSE }, { X86::VFIXUPIMMPDZ128rri, X86::VFIXUPIMMPDZ128rmi, 0 }, { X86::VFIXUPIMMPDZ256rri, X86::VFIXUPIMMPDZ256rmi, 0 }, { X86::VFIXUPIMMPDZrri, 
X86::VFIXUPIMMPDZrmi, 0 }, @@ -3352,6 +3368,10 @@ { X86::VFMADD231SSZr_Int, X86::VFMADD231SSZm_Int, TB_NO_REVERSE }, { X86::VFMADD231SSr, X86::VFMADD231SSm, 0 }, { X86::VFMADD231SSr_Int, X86::VFMADD231SSm_Int, TB_NO_REVERSE }, + { X86::VFMADDCPHZ128r, X86::VFMADDCPHZ128m, 0 }, + { X86::VFMADDCPHZ256r, X86::VFMADDCPHZ256m, 0 }, + { X86::VFMADDCPHZr, X86::VFMADDCPHZm, 0 }, + { X86::VFMADDCSHZr, X86::VFMADDCSHZm, TB_NO_REVERSE }, { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, 0 }, { X86::VFMADDPD4rr, X86::VFMADDPD4rm, 0 }, { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, 0 }, @@ -3523,6 +3543,10 @@ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE }, { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 }, { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE }, + { X86::VFMULCPHZ128rrkz, X86::VFMULCPHZ128rmkz, 0 }, + { X86::VFMULCPHZ256rrkz, X86::VFMULCPHZ256rmkz, 0 }, + { X86::VFMULCPHZrrkz, X86::VFMULCPHZrmkz, 0 }, + { X86::VFMULCSHZrrkz, X86::VFMULCSHZrmkz, TB_NO_REVERSE }, { X86::VFNMADD132PDYr, X86::VFNMADD132PDYm, 0 }, { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128m, 0 }, { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256m, 0 }, @@ -4655,6 +4679,18 @@ { X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mkz, 0 }, { X86::VDPBF16PSZrk, X86::VDPBF16PSZmk, 0 }, { X86::VDPBF16PSZrkz, X86::VDPBF16PSZmkz, 0 }, + { X86::VFCMADDCPHZ128rk, X86::VFCMADDCPHZ128mk, 0 }, + { X86::VFCMADDCPHZ128rkz, X86::VFCMADDCPHZ128mkz, 0 }, + { X86::VFCMADDCPHZ256rk, X86::VFCMADDCPHZ256mk, 0 }, + { X86::VFCMADDCPHZ256rkz, X86::VFCMADDCPHZ256mkz, 0 }, + { X86::VFCMADDCPHZrk, X86::VFCMADDCPHZmk, 0 }, + { X86::VFCMADDCPHZrkz, X86::VFCMADDCPHZmkz, 0 }, + { X86::VFCMADDCSHZrk, X86::VFCMADDCSHZmk, TB_NO_REVERSE }, + { X86::VFCMADDCSHZrkz, X86::VFCMADDCSHZmkz, TB_NO_REVERSE }, + { X86::VFCMULCPHZ128rrk, X86::VFCMULCPHZ128rmk, 0 }, + { X86::VFCMULCPHZ256rrk, X86::VFCMULCPHZ256rmk, 0 }, + { X86::VFCMULCPHZrrk, X86::VFCMULCPHZrmk, 0 }, + { X86::VFCMULCSHZrrk, X86::VFCMULCSHZrmk, TB_NO_REVERSE }, { X86::VFIXUPIMMPDZ128rrik, 
X86::VFIXUPIMMPDZ128rmik, 0 }, { X86::VFIXUPIMMPDZ128rrikz, X86::VFIXUPIMMPDZ128rmikz, 0 }, { X86::VFIXUPIMMPDZ256rrik, X86::VFIXUPIMMPDZ256rmik, 0 }, @@ -4743,6 +4779,14 @@ { X86::VFMADD231SHZr_Intkz, X86::VFMADD231SHZm_Intkz, TB_NO_REVERSE }, { X86::VFMADD231SSZr_Intk, X86::VFMADD231SSZm_Intk, TB_NO_REVERSE }, { X86::VFMADD231SSZr_Intkz, X86::VFMADD231SSZm_Intkz, TB_NO_REVERSE }, + { X86::VFMADDCPHZ128rk, X86::VFMADDCPHZ128mk, 0 }, + { X86::VFMADDCPHZ128rkz, X86::VFMADDCPHZ128mkz, 0 }, + { X86::VFMADDCPHZ256rk, X86::VFMADDCPHZ256mk, 0 }, + { X86::VFMADDCPHZ256rkz, X86::VFMADDCPHZ256mkz, 0 }, + { X86::VFMADDCPHZrk, X86::VFMADDCPHZmk, 0 }, + { X86::VFMADDCPHZrkz, X86::VFMADDCPHZmkz, 0 }, + { X86::VFMADDCSHZrk, X86::VFMADDCSHZmk, TB_NO_REVERSE }, + { X86::VFMADDCSHZrkz, X86::VFMADDCSHZmkz, TB_NO_REVERSE }, { X86::VFMADDSUB132PDZ128rk, X86::VFMADDSUB132PDZ128mk, 0 }, { X86::VFMADDSUB132PDZ128rkz, X86::VFMADDSUB132PDZ128mkz, 0 }, { X86::VFMADDSUB132PDZ256rk, X86::VFMADDSUB132PDZ256mk, 0 }, @@ -4923,6 +4967,10 @@ { X86::VFMSUBADD231PSZ256rkz, X86::VFMSUBADD231PSZ256mkz, 0 }, { X86::VFMSUBADD231PSZrk, X86::VFMSUBADD231PSZmk, 0 }, { X86::VFMSUBADD231PSZrkz, X86::VFMSUBADD231PSZmkz, 0 }, + { X86::VFMULCPHZ128rrk, X86::VFMULCPHZ128rmk, 0 }, + { X86::VFMULCPHZ256rrk, X86::VFMULCPHZ256rmk, 0 }, + { X86::VFMULCPHZrrk, X86::VFMULCPHZrmk, 0 }, + { X86::VFMULCSHZrrk, X86::VFMULCSHZrmk, TB_NO_REVERSE }, { X86::VFNMADD132PDZ128rk, X86::VFNMADD132PDZ128mk, 0 }, { X86::VFNMADD132PDZ128rkz, X86::VFNMADD132PDZ128mkz, 0 }, { X86::VFNMADD132PDZ256rk, X86::VFNMADD132PDZ256mk, 0 }, diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -571,6 +571,24 @@ def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>; def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>; +def x86vfmaddc : 
SDNode<"X86ISD::VFMADDC", SDTFPTernaryOp>; +def x86vfmaddcRnd : SDNode<"X86ISD::VFMADDC_RND", SDTFmaRound>; +def x86vfcmaddc : SDNode<"X86ISD::VFCMADDC", SDTFPTernaryOp>; +def x86vfcmaddcRnd : SDNode<"X86ISD::VFCMADDC_RND", SDTFmaRound>; +def x86vfmulc : SDNode<"X86ISD::VFMULC", SDTFPBinOp>; +def x86vfmulcRnd : SDNode<"X86ISD::VFMULC_RND", SDTFPBinOpRound>; +def x86vfcmulc : SDNode<"X86ISD::VFCMULC", SDTFPBinOp>; +def x86vfcmulcRnd : SDNode<"X86ISD::VFCMULC_RND", SDTFPBinOpRound>; + +def x86vfmaddcSh : SDNode<"X86ISD::VFMADDCSH", SDTFPTernaryOp>; +def x86vfcmaddcSh : SDNode<"X86ISD::VFCMADDCSH", SDTFPTernaryOp>; +def x86vfmulcSh : SDNode<"X86ISD::VFMULCSH", SDTFPBinOp>; +def x86vfcmulcSh : SDNode<"X86ISD::VFCMULCSH", SDTFPBinOp>; +def x86vfmaddcShRnd : SDNode<"X86ISD::VFMADDCSH_RND", SDTFmaRound>; +def x86vfcmaddcShRnd : SDNode<"X86ISD::VFCMADDCSH_RND",SDTFmaRound>; +def x86vfmulcShRnd : SDNode<"X86ISD::VFMULCSH_RND", SDTFPBinOpRound>; +def x86vfcmulcShRnd : SDNode<"X86ISD::VFCMULCSH_RND", SDTFPBinOpRound>; + def X86rsqrt14 : SDNode<"X86ISD::RSQRT14", SDTFPUnaryOp>; def X86rcp14 : SDNode<"X86ISD::RCP14", SDTFPUnaryOp>; diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -24,6 +24,7 @@ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP_IMM8, INTR_TYPE_3OP_IMM8, + FMA_OP_MASK, FMA_OP_MASKZ, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI, CVTPD2PS_MASK, INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE, @@ -1160,6 +1161,30 @@ X86ISD::CVTUI2P, X86ISD::MCVTUI2P), X86_INTRINSIC_DATA(avx512fp16_mask_vcvtuqq2ph_256, TRUNCATE_TO_REG, X86ISD::CVTUI2P, X86ISD::MCVTUI2P), + X86_INTRINSIC_DATA(avx512fp16_mask_vfcmaddc_ph_128, FMA_OP_MASK, X86ISD::VFCMADDC, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vfcmaddc_ph_256, FMA_OP_MASK, 
X86ISD::VFCMADDC, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vfcmaddc_ph_512, FMA_OP_MASK, X86ISD::VFCMADDC, X86ISD::VFCMADDC_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vfcmaddc_sh, FMA_OP_MASK, X86ISD::VFCMADDCSH, X86ISD::VFCMADDCSH_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vfcmulc_ph_128, INTR_TYPE_2OP_MASK, X86ISD::VFCMULC, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vfcmulc_ph_256, INTR_TYPE_2OP_MASK, X86ISD::VFCMULC, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vfcmulc_ph_512, INTR_TYPE_2OP_MASK, X86ISD::VFCMULC, X86ISD::VFCMULC_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vfcmulc_sh, INTR_TYPE_SCALAR_MASK, X86ISD::VFCMULCSH, X86ISD::VFCMULCSH_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vfmaddc_ph_128, FMA_OP_MASK, X86ISD::VFMADDC, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vfmaddc_ph_256, FMA_OP_MASK, X86ISD::VFMADDC, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vfmaddc_ph_512, FMA_OP_MASK, X86ISD::VFMADDC, X86ISD::VFMADDC_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vfmaddc_sh, FMA_OP_MASK, X86ISD::VFMADDCSH, X86ISD::VFMADDCSH_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vfmulc_ph_128, INTR_TYPE_2OP_MASK, X86ISD::VFMULC, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vfmulc_ph_256, INTR_TYPE_2OP_MASK, X86ISD::VFMULC, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vfmulc_ph_512, INTR_TYPE_2OP_MASK, X86ISD::VFMULC, X86ISD::VFMULC_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vfmulc_sh, INTR_TYPE_SCALAR_MASK, X86ISD::VFMULCSH, X86ISD::VFMULCSH_RND), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmaddc_ph_128, FMA_OP_MASKZ, X86ISD::VFCMADDC, 0), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmaddc_ph_256, FMA_OP_MASKZ, X86ISD::VFCMADDC, 0), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmaddc_ph_512, FMA_OP_MASKZ, X86ISD::VFCMADDC, X86ISD::VFCMADDC_RND), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmaddc_sh, FMA_OP_MASKZ, X86ISD::VFCMADDCSH, X86ISD::VFCMADDCSH_RND), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfmaddc_ph_128, FMA_OP_MASKZ, X86ISD::VFMADDC, 0), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfmaddc_ph_256, 
FMA_OP_MASKZ, X86ISD::VFMADDC, 0), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfmaddc_ph_512, FMA_OP_MASKZ, X86ISD::VFMADDC, X86ISD::VFMADDC_RND), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfmaddc_sh, FMA_OP_MASKZ, X86ISD::VFMADDCSH, X86ISD::VFMADDCSH_RND), X86_INTRINSIC_DATA(avx512fp16_max_ph_128, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512fp16_max_ph_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512fp16_max_ph_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE), diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td --- a/llvm/lib/Target/X86/X86ScheduleZnver3.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td @@ -61,7 +61,7 @@ let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass. - let CompleteModel = 1; + let CompleteModel = 0; } let SchedModel = Znver3Model in { diff --git a/llvm/test/CodeGen/X86/avx512cfma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512cfma-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512cfma-intrinsics.ll @@ -0,0 +1,224 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512fp16 -mattr=+avx512vl | FileCheck %s + +declare <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) +declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float> @test_int_x86_avx512fp8_mask_cfmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfmadd_ph_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + +define <4 x float> 
@test_int_x86_avx512fp8_maskz_cfmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp8_maskz_cfmadd_ph_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.ph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp8_cfmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2){ +; CHECK-LABEL: test_int_x86_avx512fp8_cfmadd_ph_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfmaddcph %xmm0, %xmm1, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.128(<4 x float> %x2, <4 x float> %x1, <4 x float> %x0, i8 -1) + ret <4 x float> %res +} + + +declare <8 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) +declare <8 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float> @test_int_x86_avx512fp16_mask_cfmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmadd_ph_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + ret <8 x float> %res +} + +define <8 x float> @test_int_x86_avx512fp16_maskz_cfmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmadd_ph_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.ph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + 
ret <8 x float> %res +} + +define <8 x float> @test_int_x86_avx512fp16_cfmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfmadd_ph_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfmaddcph %ymm0, %ymm1, %ymm2 +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.256(<8 x float> %x2, <8 x float> %x1, <8 x float> %x0, i8 -1) + ret <8 x float> %res +} + + +declare <16 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @test_int_x86_avx512fp16_mask_cfmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmadd_ph_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512fp16_maskz_cfmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmadd_ph_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.ph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512fp16_cfmadd_ph_512_rn(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfmadd_ph_512_rn: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfmaddcph {rz-sae}, %zmm0, %zmm1, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x 
float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 11) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512fp16_cfmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfmadd_ph_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfmaddcph %zmm0, %zmm1, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 4) + ret <16 x float> %res +} + +declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) +declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float> @test_int_x86_avx512fp8_mask_cfcmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfcmadd_ph_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp8_maskz_cfcmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp8_maskz_cfcmadd_ph_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp8_cfcmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2){ +; CHECK-LABEL: test_int_x86_avx512fp8_cfcmadd_ph_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfcmaddcph %xmm0, %xmm1, %xmm2 +; 
CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.128(<4 x float> %x2, <4 x float> %x1, <4 x float> %x0, i8 -1) + ret <4 x float> %res +} + + +declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) +declare <8 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float> @test_int_x86_avx512fp16_mask_cfcmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmadd_ph_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + ret <8 x float> %res +} + +define <8 x float> @test_int_x86_avx512fp16_maskz_cfcmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmadd_ph_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + ret <8 x float> %res +} + +define <8 x float> @test_int_x86_avx512fp16_cfcmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfcmadd_ph_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfcmaddcph %ymm0, %ymm1, %ymm2 +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.256(<8 x float> %x2, <8 x float> %x1, <8 x float> %x0, i8 -1) + ret <8 x float> %res +} + + +declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.512(<16 x float>, 
<16 x float>, <16 x float>, i16, i32) + +define <16 x float> @test_int_x86_avx512fp16_mask_cfcmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmadd_ph_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmaddcph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512fp16_maskz_cfcmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmadd_ph_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmaddcph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512fp16_cfcmadd_ph_512_rn(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfcmadd_ph_512_rn: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfcmaddcph {rz-sae}, %zmm0, %zmm1, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 11) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512fp16_cfcmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfcmadd_ph_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfcmaddcph %zmm0, %zmm1, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 4) + ret <16 x float> %res +} diff --git 
a/llvm/test/CodeGen/X86/avx512cfmul-intrinsics.ll b/llvm/test/CodeGen/X86/avx512cfmul-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512cfmul-intrinsics.ll @@ -0,0 +1,218 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512fp16 -mattr=+avx512vl | FileCheck %s + +declare <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float> @test_int_x86_avx512fp16_mask_cfmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmul_ph_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_maskz_cfmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmul_ph_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x3) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_cfmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfmul_ph_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %x2, <4 x float> %x1, <4 x float> %x0, i8 -1) + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float>, <8 x
float>, <8 x float>, i8) + +define <8 x float> @test_int_x86_avx512fp16_mask_cfmul_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmul_ph_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmulcph %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + ret <8 x float> %res +} + +define <8 x float> @test_int_x86_avx512fp16_maskz_cfmul_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmul_ph_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmulcph %ymm1, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> zeroinitializer, i8 %x3) + ret <8 x float> %res +} + +define <8 x float> @test_int_x86_avx512fp16_cfmul_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfmul_ph_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfmulcph %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float> %x2, <8 x float> %x1, <8 x float> %x0, i8 -1) + ret <8 x float> %res +} + +declare <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @test_int_x86_avx512fp16_mask_cfmul_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmul_ph_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float> %x0, <16 x float> %x1, <16 x 
float> %x2, i16 %x3, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512fp16_maskz_cfmul_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmul_ph_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> zeroinitializer, i16 %x3, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512fp16_cfmul_ph_512_rn(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfmul_ph_512_rn: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfmulcph {rz-sae}, %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 11) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512fp16_cfmul_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfmul_ph_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfmulcph %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 4) + ret <16 x float> %res +} + +declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float> @test_int_x86_avx512fp16_mask_cfcmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmul_ph_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmulcph %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.128(<4 x float> %x0, <4 x float>
%x1, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_maskz_cfcmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmul_ph_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmulcph %xmm1, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x3) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_cfcmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfcmul_ph_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfcmulcph %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.128(<4 x float> %x2, <4 x float> %x1, <4 x float> %x0, i8 -1) + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float> @test_int_x86_avx512fp16_mask_cfcmul_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmul_ph_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmulcph %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + ret <8 x float> %res +} + +define <8 x float> @test_int_x86_avx512fp16_maskz_cfcmul_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmul_ph_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmulcph %ymm1, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.256(<8 x
float> %x0, <8 x float> %x1, <8 x float> zeroinitializer, i8 %x3) + ret <8 x float> %res +} + +define <8 x float> @test_int_x86_avx512fp16_cfcmul_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfcmul_ph_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfcmulcph %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.256(<8 x float> %x2, <8 x float> %x1, <8 x float> %x0, i8 -1) + ret <8 x float> %res +} + +declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @test_int_x86_avx512fp16_mask_cfcmul_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmul_ph_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmulcph %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512fp16_maskz_cfcmul_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmul_ph_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmulcph %zmm1, %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> zeroinitializer, i16 %x3, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512fp16_cfcmul_ph_512_rn(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfcmul_ph_512_rn: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfcmulcph {rz-sae}, %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> 
@llvm.x86.avx512fp16.mask.vfcmulc.ph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 11) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512fp16_cfcmul_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp16_cfcmul_ph_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfcmulcph %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 4) + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll b/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll @@ -0,0 +1,267 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512fp16 -mattr=+avx512vl | FileCheck %s + +declare <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) +declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmulc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) +declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) +declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmulc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) +declare <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) +declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) +declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) +declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +;; no mask, no rounding + +define <4 x float> @test_nm_nr_int_x86_avx512fp16_mask_cfmul_sh(<4 x float> %x0, <4 x float> %x1) { +; CHECK-LABEL: 
test_nm_nr_int_x86_avx512fp16_mask_cfmul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfmulcsh %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> undef, i8 -1, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_nm_nr_int_x86_avx512fp16_mask_cfcmul_sh(<4 x float> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_nm_nr_int_x86_avx512fp16_mask_cfcmul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfcmulcsh %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> undef, i8 -1, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_nm_nr_int_x86_avx512fp16_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_nm_nr_int_x86_avx512fp16_cfmadd_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfmaddcsh %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_nm_nr_int_x86_avx512fp16_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_nm_nr_int_x86_avx512fp16_cfcmadd_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfcmaddcsh %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) + ret <4 x float> %res +} + +;; no mask, rounding + +define <4 x float> @test_nm_r_int_x86_avx512fp16_mask_cfmul_sh(<4 x float> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_nm_r_int_x86_avx512fp16_mask_cfmul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfmulcsh {rd-sae}, %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.sh(<4 x float> %x0, <4 x float> 
%x1, <4 x float> undef, i8 -1, i32 9) + ret <4 x float> %res +} + +define <4 x float> @test_nm_r_int_x86_avx512fp16_mask_cfcmul_sh(<4 x float> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_nm_r_int_x86_avx512fp16_mask_cfcmul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfcmulcsh {rd-sae}, %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> undef, i8 -1, i32 9) + ret <4 x float> %res +} + +define <4 x float> @test_nm_r_int_x86_avx512fp16_mask_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_nm_r_int_x86_avx512fp16_mask_cfmadd_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 9) + ret <4 x float> %res +} + +define <4 x float> @test_nm_r_int_x86_avx512fp16_mask_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_nm_r_int_x86_avx512fp16_mask_cfcmadd_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vfcmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 9) + ret <4 x float> %res +} + +;; mask, no rounding + +define <4 x float> @test_m_nr_int_x86_avx512fp16_mask_cfmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_mask_cfmul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmulcsh %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_m_nr_int_x86_avx512fp16_mask_cfcmul_sh(<4 x float> %x0, <4 x float> 
%x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_mask_cfcmul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmulcsh %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_m_nr_int_x86_avx512fp16_mask_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_mask_cfmadd_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmaddcsh %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_m_nr_int_x86_avx512fp16_mask_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_mask_cfcmadd_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmaddcsh %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + ret <4 x float> %res +} + +;; mask, rounding + +define <4 x float> @test_int_x86_avx512fp16_mask_cfmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmulcsh {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_mask_cfcmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: 
test_int_x86_avx512fp16_mask_cfcmul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmulcsh {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_mask_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmadd_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_mask_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmadd_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9) + ret <4 x float> %res +} + +;; maskz, no rounding + +define <4 x float> @test_m_nr_int_x86_avx512fp16_maskz_cfmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_maskz_cfmul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmulcsh %xmm1, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x3, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_m_nr_int_x86_avx512fp16_maskz_cfcmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: 
test_m_nr_int_x86_avx512fp16_maskz_cfcmul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmulcsh %xmm1, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x3, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_m_nr_int_x86_avx512fp16_maskz_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_maskz_cfmadd_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmaddcsh %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_m_nr_int_x86_avx512fp16_maskz_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_maskz_cfcmadd_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmaddcsh %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + ret <4 x float> %res +} + +;; maskz, rounding + +define <4 x float> @test_int_x86_avx512fp16_maskz_cfmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmulcsh {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x3, i32 9) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_maskz_cfcmul_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; 
CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmulcsh {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x3, i32 9) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_maskz_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmadd_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_maskz_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmadd_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9) + ret <4 x float> %res +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s + +define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce, <32 x half> %rhs.coerce) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0 
+; CHECK-NEXT: retq +entry: + %0 = bitcast <32 x half> %lhs.coerce to <16 x float> + %1 = bitcast <32 x half> %rhs.coerce to <16 x float> + %2 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) + %3 = bitcast <16 x float> %2 to <32 x half> + %add.i.i = fadd fast <32 x half> %3, %acc.coerce + ret <32 x half> %add.i.i +} + +define dso_local <16 x half> @test2(<16 x half> %acc.coerce, <16 x half> %lhs.coerce, <16 x half> %rhs.coerce) { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <16 x half> %lhs.coerce to <8 x float> + %1 = bitcast <16 x half> %rhs.coerce to <8 x float> + %2 = tail call fast <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1) + %3 = bitcast <8 x float> %2 to <16 x half> + %add.i.i = fadd fast <16 x half> %3, %acc.coerce + ret <16 x half> %add.i.i +} + +define dso_local <8 x half> @test3(<8 x half> %acc.coerce, <8 x half> %lhs.coerce, <8 x half> %rhs.coerce) { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x half> %lhs.coerce to <4 x float> + %1 = bitcast <8 x half> %rhs.coerce to <4 x float> + %2 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1) + %3 = bitcast <4 x float> %2 to <8 x half> + %add.i.i = fadd fast <8 x half> %3, %acc.coerce + ret <8 x half> %add.i.i +} + + +define dso_local <8 x half> @test4(<8 x half> %acc.coerce, <8 x half> %lhs.coerce, <8 x half> %rhs.coerce) { +; CHECK-LABEL: test4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x half> %lhs.coerce to <4 x float> + %1 = bitcast <8 x half> %rhs.coerce to <4 x float> + %2 = tail 
call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1) + %3 = bitcast <4 x float> %2 to <8 x half> + %add.i.i = fadd fast <8 x half> %acc.coerce, %3 + ret <8 x half> %add.i.i +} + +declare <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg) +declare <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) +declare <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll @@ -0,0 +1,150 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s + +define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfcmaddcph %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <32 x half> %lhs.coerce.conj to <16 x i32> + %xor.i.i = xor <16 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648> ; sign-bit mask per i32 lane: the conj xor the CHECK lines expect to fold into vfcmaddcph + %1 = bitcast <16 x i32> %xor.i.i to <16 x float> + %2 = bitcast <32 x half> %rhs.coerce to <16 x float> + %3 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float> %1, <16 x float> %2, <16 x float> zeroinitializer, i16 -1, i32 4) #2 + %4 = bitcast <16 x float> %3 to <32 x half> + %add = fadd fast <32 x half> %4, %acc.coerce + ret <32 x half> %add +} + +define dso_local <32 x half> @test2(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT:
vfcmaddcph %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <32 x half> %lhs.coerce.conj to <16 x i32> + %xor.i.i = xor <16 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648> ; sign-bit mask per i32 lane: the conj xor the CHECK lines expect to fold into vfcmaddcph + %1 = bitcast <16 x i32> %xor.i.i to <16 x float> + %2 = bitcast <32 x half> %rhs.coerce to <16 x float> + %3 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float> %2, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) #2 + %4 = bitcast <16 x float> %3 to <32 x half> + %add = fadd fast <32 x half> %4, %acc.coerce + ret <32 x half> %add +} + +define dso_local <16 x half> @test3(<16 x half> %acc.coerce, <16 x half> %lhs.coerce.conj, <16 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfcmaddcph %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <16 x half> %lhs.coerce.conj to <8 x i32> + %xor.i.i = xor <8 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648> ; sign-bit mask (conj), 8 lanes + %1 = bitcast <8 x i32> %xor.i.i to <8 x float> + %2 = bitcast <16 x half> %rhs.coerce to <8 x float> + %3 = tail call fast <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float> %1, <8 x float> %2, <8 x float> zeroinitializer, i8 -1) #2 + %4 = bitcast <8 x float> %3 to <16 x half> + %add = fadd fast <16 x half> %4, %acc.coerce + ret <16 x half> %add +} + +define dso_local <8 x half> @test4(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32> + %xor.i.i = xor <4 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648> ; sign-bit mask (conj), 4 lanes + %1 = bitcast <4 x i32> %xor.i.i to <4 x float> + %2 = bitcast <8 x half> %rhs.coerce to <4 x float> + %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2 + %4 = bitcast <4 x float> %3 to <8 x half> + %add = fadd fast <8 x half> %4, %acc.coerce + ret <8 x half> %add +} + +define dso_local <8 x half>
@test5(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32> + %xor.i.i = xor <4 x i32> , %0 + %1 = bitcast <4 x i32> %xor.i.i to <4 x float> + %2 = bitcast <8 x half> %rhs.coerce to <4 x float> + %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2 + %4 = bitcast <4 x float> %3 to <8 x half> + %add = fadd fast <8 x half> %4, %acc.coerce + ret <8 x half> %add +} + +define dso_local <8 x half> @test6(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32> + %xor.i.i = xor <4 x i32> , %0 + %1 = bitcast <4 x i32> %xor.i.i to <4 x float> + %2 = bitcast <8 x half> %rhs.coerce to <4 x float> + %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2 + %4 = bitcast <4 x float> %3 to <8 x half> + %add = fadd fast <8 x half> %4, %acc.coerce + ret <8 x half> %add +} + +define dso_local <8 x half> @test7(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32> + %xor.i.i = xor <4 x i32> , %0 + %1 = bitcast <4 x i32> %xor.i.i to <4 x float> + %2 = bitcast <8 x half> %rhs.coerce to <4 x float> + %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 
x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2 + %4 = bitcast <4 x float> %3 to <8 x half> + %add = fadd fast <8 x half> %acc.coerce, %4 + ret <8 x half> %add +} + +define dso_local <8 x half> @test8(<8 x half> %acc.coerce, <4 x float> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <4 x float> %lhs.coerce.conj to <4 x i32> + %xor.i.i = xor <4 x i32> , %0 + %1 = bitcast <4 x i32> %xor.i.i to <4 x float> + %2 = bitcast <8 x half> %rhs.coerce to <4 x float> + %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2 + %4 = bitcast <4 x float> %3 to <8 x half> + %add = fadd fast <8 x half> %acc.coerce, %4 + ret <8 x half> %add +} + +define dso_local <32 x half> @test9(<32 x half> %acc.coerce, <8 x i64> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test9: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfcmaddcph %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq +entry: + %xor1.i = xor <8 x i64> %lhs.coerce.conj, + %0 = bitcast <8 x i64> %xor1.i to <16 x float> + %1 = bitcast <32 x half> %rhs.coerce to <16 x float> + %2 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) #2 + %3 = bitcast <16 x float> %2 to <32 x half> + %add = fadd fast <32 x half> %3, %acc.coerce + ret <32 x half> %add +} + +declare <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg) +declare <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) +declare <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll 
b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s + +define dso_local <32 x half> @test1(<32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfcmulcph %zmm0, %zmm1, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <32 x half> %lhs.coerce.conj to <16 x i32> + %xor.i.i = xor <16 x i32> %0, + %1 = bitcast <16 x i32> %xor.i.i to <16 x float> + %2 = bitcast <32 x half> %rhs.coerce to <16 x float> + %3 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float> %1, <16 x float> %2, <16 x float> zeroinitializer, i16 -1, i32 4) #2 + %4 = bitcast <16 x float> %3 to <32 x half> + ret <32 x half> %4 +} + +; Function Attrs: nounwind readnone uwtable +define dso_local <16 x half> @test2(<16 x half> %lhs.coerce.conj, <16 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfcmulcph %ymm0, %ymm1, %ymm2 +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <16 x half> %lhs.coerce.conj to <8 x i32> + %xor.i.i = xor <8 x i32> %0, + %1 = bitcast <8 x i32> %xor.i.i to <8 x float> + %2 = bitcast <16 x half> %rhs.coerce to <8 x float> + %3 = tail call fast <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float> %1, <8 x float> %2, <8 x float> zeroinitializer, i8 -1) #2 + %4 = bitcast <8 x float> %3 to <16 x half> + ret <16 x half> %4 +} + +define dso_local <8 x half> @test3(<8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfcmulcph 
%xmm0, %xmm1, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32> + %xor.i.i = xor <4 x i32> %0, + %1 = bitcast <4 x i32> %xor.i.i to <4 x float> + %2 = bitcast <8 x half> %rhs.coerce to <4 x float> + %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2 + %4 = bitcast <4 x float> %3 to <8 x half> + ret <8 x half> %4 +} + +define dso_local <8 x half> @test4(<8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 { +; CHECK-LABEL: test4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmulcph %xmm0, %xmm1, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32> + %xor.i.i = xor <4 x i32> %0, + %1 = bitcast <4 x i32> %xor.i.i to <4 x float> + %2 = bitcast <8 x half> %rhs.coerce to <4 x float> + %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2 + %4 = bitcast <4 x float> %3 to <8 x half> + ret <8 x half> %4 +} + +declare <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg) +declare <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) +declare <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) +declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll @@ -989,5 +989,225 @@ ret <8 x half> %5 } +define <16 x float> @stack_fold_fmulcph(<16 x float> %a0, <16 x float> %a1) { + ;CHECK-LABEL: stack_fold_fmulcph: + 
;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %2 +} +declare <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @stack_fold_fmulcph_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %passthru, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmulcph_mask: + ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <16 x float>, <16 x float>* %passthru + %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %2, i16 %mask, i32 4) + ret <16 x float> %3 +} + +define <16 x float> @stack_fold_fmulcph_maskz(<16 x float> %a0, <16 x float> %a1, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmulcph_maskz: + ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4) + ret <16 x float> %3 +} + +define <16 x float> @stack_fold_fcmulcph(<16 x float> %a0, <16 x float> %a1) { + ;CHECK-LABEL: stack_fold_fcmulcph: + ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %2 +} +declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @stack_fold_fcmulcph_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %passthru, i16 %mask) { + ;CHECK-LABEL: stack_fold_fcmulcph_mask: + ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <16 x float>, <16 x float>* %passthru + %3 = call <16 x 
float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %2, i16 %mask, i32 4) + ret <16 x float> %3 +} + +define <16 x float> @stack_fold_fcmulcph_maskz(<16 x float> %a0, <16 x float> %a1, i16* %mask) { + ;CHECK-LABEL: stack_fold_fcmulcph_maskz: + ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4) + ret <16 x float> %3 +} + +define <16 x float> @stack_fold_fmaddcph(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ;CHECK-LABEL: stack_fold_fmaddcph: + ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) + ret <16 x float> %2 +} +declare <16 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @stack_fold_fmaddcph_mask(<16 x float>* %p, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmaddcph_mask: + ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), 
{{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x float>, <16 x float>* %p + %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) + ret <16 x float> %2 +} + +define <16 x float> @stack_fold_fmaddcph_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmaddcph_maskz: + ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.ph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %2, i32 4) + ret <16 x float> %3 +} +declare <16 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @stack_fold_fcmaddcph(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ;CHECK-LABEL: stack_fold_fcmaddcph: + ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) + ret <16 x float> %2 +} +declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @stack_fold_fcmaddcph_mask(<16 x float>* %p, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fcmaddcph_mask: + ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x float>, <16 x float>* %p + %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) + ret <16 x float> %2 +} + +define <16 x float> @stack_fold_fcmaddcph_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fcmaddcph_maskz: + ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, 
i16* %mask + %3 = call <16 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %2, i32 4) + ret <16 x float> %3 +} +declare <16 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <4 x float> @stack_fold_fmulcsh(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_fmulcsh: + ;CHECK: vfmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.sh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float> @stack_fold_fmulcsh_mask(<4 x float> %a0, <4 x float> %a1, <4 x float>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmulcsh_mask: + ;CHECK: vfmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <4 x float>, <4 x float>* %passthru + %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.sh(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask, i32 4) + ret <4 x float> %3 +} + +define <4 x float> @stack_fold_fmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask)
{ + ;CHECK-LABEL: stack_fold_fmulcsh_maskz: + ;CHECK: vfmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.sh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4) + ret <4 x float> %3 +} + +define <4 x float> @stack_fold_fcmulcsh(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_fcmulcsh: + ;CHECK: vfcmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.sh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float> @stack_fold_fcmulcsh_mask(<4 x float> %a0, <4 x float> %a1, <4 x float>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_fcmulcsh_mask: + ;CHECK: vfcmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <4 x float>, <4 x float>* %passthru + %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.sh(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask, i32 4) + ret <4 x float> %3 +} + +define <4 x float> @stack_fold_fcmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) { + ;CHECK-LABEL: stack_fold_fcmulcsh_maskz: + ;CHECK: vfcmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.sh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4) + ret <4 x float> %3 +} + +define <4 x float> @stack_fold_fmaddcsh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { + ;CHECK-LABEL: stack_fold_fmaddcsh: + ;CHECK: vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.sh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1, i32 4) + ret <4 x float> %2 +} +declare <4 x float> 
@llvm.x86.avx512fp16.mask.vfmaddc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float> @stack_fold_fmaddcsh_mask(<4 x float>* %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmaddcsh_mask: + ;CHECK: vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <4 x float>, <4 x float>* %p + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.sh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) + ret <4 x float> %2 +} + +define <4 x float> @stack_fold_fmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmaddcsh_maskz: + ;CHECK: vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.sh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %2, i32 4) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float> @stack_fold_fcmaddcsh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { + ;CHECK-LABEL: stack_fold_fcmaddcsh: + ;CHECK: vfcmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} 
{{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.sh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1, i32 4) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float> @stack_fold_fcmaddcsh_mask(<4 x float>* %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fcmaddcsh_mask: + ;CHECK: vfcmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <4 x float>, <4 x float>* %p + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.sh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) + ret <4 x float> %2 +} + +define <4 x float> @stack_fold_fcmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fcmaddcsh_maskz: + ;CHECK: vfcmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.sh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %2, i32 4) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.sh(<4 x float>, <4 x float>, <4 x float>, i8, i32) + attributes #0 = { "unsafe-fp-math"="false" } attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll @@ -588,5 +588,225 @@ ret <16 x half> %2 } +define <4 x float> @stack_fold_fmulc(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_fmulc: + ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float> @stack_fold_fmulc_mask(<4 x float> %a0, <4 x float> %a1, <4 x float>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmulc_mask: + ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), 
{{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <4 x float>, <4 x float>* %passthru + %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask) + ret <4 x float> %3 +} + +define <4 x float> @stack_fold_fmulc_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmulc_maskz: + ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2) + ret <4 x float> %3 +} + +define <4 x float> @stack_fold_fcmulc(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_fcmulc: + ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> 
@llvm.x86.avx512fp16.mask.vfcmulc.ph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float> @stack_fold_fcmulc_mask(<4 x float> %a0, <4 x float> %a1, <4 x float>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_fcmulc_mask: + ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <4 x float>, <4 x float>* %passthru + %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask) + ret <4 x float> %3 +} + +define <4 x float> @stack_fold_fcmulc_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) { + ;CHECK-LABEL: stack_fold_fcmulc_maskz: + ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2) + ret <4 x float> %3 +} + +define <4 x float> @stack_fold_fmaddc(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { + ;CHECK-LABEL: stack_fold_fmaddc: + ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), 
{{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float> @stack_fold_fmaddc_mask(<4 x float>* %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmaddc_mask: + ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <4 x float>, <4 x float>* %p + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) + ret <4 x float> %2 +} + +define <4 x float> @stack_fold_fmaddc_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmaddc_maskz: + ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.ph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %2) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float> @stack_fold_fcmaddc(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { + ;CHECK-LABEL: stack_fold_fcmaddc: + ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float> @stack_fold_fcmaddc_mask(<4 x float>* %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fcmaddc_mask: + ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <4 x float>, <4 
x float>* %p + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) + ret <4 x float> %2 +} + +define <4 x float> @stack_fold_fcmaddc_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fcmaddc_maskz: + ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %2) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <8 x float> @stack_fold_fmulc_ymm(<8 x float> %a0, <8 x float> %a1) { + ;CHECK-LABEL: stack_fold_fmulc_ymm: + ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1) + ret <8 x float> %2 +} +declare <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float> @stack_fold_fmulc_mask_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float>* %passthru, i8 %mask) { + ;CHECK-LABEL: 
stack_fold_fmulc_mask_ymm: + ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x float>, <8 x float>* %passthru + %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %2, i8 %mask) + ret <8 x float> %3 +} + +define <8 x float> @stack_fold_fmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmulc_maskz_ymm: + ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmulc.ph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2) + ret <8 x float> %3 +} + +define <8 x float> @stack_fold_fcmulc_ymm(<8 x float> %a0, <8 x float> %a1) { + ;CHECK-LABEL: stack_fold_fcmulc_ymm: + ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1) + ret <8 x float> %2 +} +declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float> @stack_fold_fcmulc_mask_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_fcmulc_mask_ymm: + ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x float>, <8 x float>* %passthru + %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmulc.ph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %2, i8 %mask) + ret <8 x float> %3 +} + +define <8 x float> @stack_fold_fcmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i8* %mask) { + ;CHECK-LABEL: stack_fold_fcmulc_maskz_ymm: + ;CHECK: vfcmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x float> 
@llvm.x86.avx512fp16.mask.vfcmulc.ph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2) + ret <8 x float> %3 +} + +define <8 x float> @stack_fold_fmaddc_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { + ;CHECK-LABEL: stack_fold_fmaddc_ymm: + ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 -1) + ret <8 x float> %2 +} +declare <8 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float> @stack_fold_fmaddc_mask_ymm(<8 x float>* %p, <8 x float> %a1, <8 x float> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmaddc_mask_ymm: + ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x float>, <8 x float>* %p + %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmaddc.ph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) + ret <8 x float> %2 +} + +define <8 x float> @stack_fold_fmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmaddc_maskz_ymm: + ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} 
{{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.ph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx512fp16.maskz.vfmaddc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float> @stack_fold_fcmaddc_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { + ;CHECK-LABEL: stack_fold_fcmaddc_ymm: + ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 -1) + ret <8 x float> %2 +} +declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float> @stack_fold_fcmaddc_mask_ymm(<8 x float>* %p, <8 x float> %a1, <8 x float> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fcmaddc_mask_ymm: + ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x float>, <8 x float>* %p + %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmaddc.ph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) + ret <8 x float> %2 +} + +define <8 x float> @stack_fold_fcmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fcmaddc_maskz_ymm: + ;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx512fp16.maskz.vfcmaddc.ph.256(<8 x float>, <8 x float>, <8 x float>, i8) + attributes #0 = { "unsafe-fp-math"="false" } attributes #1 = { "unsafe-fp-math"="true" } diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16.txt b/llvm/test/MC/Disassembler/X86/avx512fp16.txt --- a/llvm/test/MC/Disassembler/X86/avx512fp16.txt +++ b/llvm/test/MC/Disassembler/X86/avx512fp16.txt @@ -2484,3 +2484,195 @@ # ATT: vfnmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} # INTEL: vfnmsub231sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] 0x62,0x66,0x15,0x87,0xbf,0x72,0x80 + +# ATT: vfcmaddcph %zmm28, %zmm29, %zmm30 +# INTEL: vfcmaddcph zmm30, zmm29, zmm28 +0x62,0x06,0x17,0x40,0x56,0xf4 + +# ATT: vfcmaddcph {rn-sae}, %zmm28, 
%zmm29, %zmm30 +# INTEL: vfcmaddcph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x17,0x10,0x56,0xf4 + +# ATT: vfcmaddcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfcmaddcph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x17,0x47,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfcmaddcph (%r9){1to16}, %zmm29, %zmm30 +# INTEL: vfcmaddcph zmm30, zmm29, dword ptr [r9]{1to16} +0x62,0x46,0x17,0x50,0x56,0x31 + +# ATT: vfcmaddcph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfcmaddcph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x17,0x40,0x56,0x71,0x7f + +# ATT: vfcmaddcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfcmaddcph zmm30 {k7} {z}, zmm29, dword ptr [rdx - 512]{1to16} +0x62,0x66,0x17,0xd7,0x56,0x72,0x80 + +# ATT: vfcmaddcsh %xmm28, %xmm29, %xmm30 +# INTEL: vfcmaddcsh xmm30, xmm29, xmm28 +0x62,0x06,0x17,0x00,0x57,0xf4 + +# ATT: vfcmaddcsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfcmaddcsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x17,0x10,0x57,0xf4 + +# ATT: vfcmaddcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfcmaddcsh xmm30 {k7}, xmm29, dword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x17,0x07,0x57,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfcmaddcsh (%r9), %xmm29, %xmm30 +# INTEL: vfcmaddcsh xmm30, xmm29, dword ptr [r9] +0x62,0x46,0x17,0x00,0x57,0x31 + +# ATT: vfcmaddcsh 508(%rcx), %xmm29, %xmm30 +# INTEL: vfcmaddcsh xmm30, xmm29, dword ptr [rcx + 508] +0x62,0x66,0x17,0x00,0x57,0x71,0x7f + +# ATT: vfcmaddcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfcmaddcsh xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512] +0x62,0x66,0x17,0x87,0x57,0x72,0x80 + +# ATT: vfcmulcph %zmm28, %zmm29, %zmm30 +# INTEL: vfcmulcph zmm30, zmm29, zmm28 +0x62,0x06,0x17,0x40,0xd6,0xf4 + +# ATT: vfcmulcph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfcmulcph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x17,0x10,0xd6,0xf4 + +# ATT: vfcmulcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfcmulcph zmm30 {k7}, zmm29, zmmword ptr [rbp + 
8*r14 + 268435456] +0x62,0x26,0x17,0x47,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfcmulcph (%r9){1to16}, %zmm29, %zmm30 +# INTEL: vfcmulcph zmm30, zmm29, dword ptr [r9]{1to16} +0x62,0x46,0x17,0x50,0xd6,0x31 + +# ATT: vfcmulcph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfcmulcph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x17,0x40,0xd6,0x71,0x7f + +# ATT: vfcmulcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfcmulcph zmm30 {k7} {z}, zmm29, dword ptr [rdx - 512]{1to16} +0x62,0x66,0x17,0xd7,0xd6,0x72,0x80 + +# ATT: vfcmulcsh %xmm28, %xmm29, %xmm30 +# INTEL: vfcmulcsh xmm30, xmm29, xmm28 +0x62,0x06,0x17,0x00,0xd7,0xf4 + +# ATT: vfcmulcsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfcmulcsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x17,0x10,0xd7,0xf4 + +# ATT: vfcmulcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfcmulcsh xmm30 {k7}, xmm29, dword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x17,0x07,0xd7,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfcmulcsh (%r9), %xmm29, %xmm30 +# INTEL: vfcmulcsh xmm30, xmm29, dword ptr [r9] +0x62,0x46,0x17,0x00,0xd7,0x31 + +# ATT: vfcmulcsh 508(%rcx), %xmm29, %xmm30 +# INTEL: vfcmulcsh xmm30, xmm29, dword ptr [rcx + 508] +0x62,0x66,0x17,0x00,0xd7,0x71,0x7f + +# ATT: vfcmulcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfcmulcsh xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512] +0x62,0x66,0x17,0x87,0xd7,0x72,0x80 + +# ATT: vfmaddcph %zmm28, %zmm29, %zmm30 +# INTEL: vfmaddcph zmm30, zmm29, zmm28 +0x62,0x06,0x16,0x40,0x56,0xf4 + +# ATT: vfmaddcph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmaddcph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x16,0x10,0x56,0xf4 + +# ATT: vfmaddcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmaddcph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x16,0x47,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmaddcph (%r9){1to16}, %zmm29, %zmm30 +# INTEL: vfmaddcph zmm30, zmm29, dword ptr [r9]{1to16} +0x62,0x46,0x16,0x50,0x56,0x31 + +# ATT: vfmaddcph 
8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmaddcph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x16,0x40,0x56,0x71,0x7f + +# ATT: vfmaddcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmaddcph zmm30 {k7} {z}, zmm29, dword ptr [rdx - 512]{1to16} +0x62,0x66,0x16,0xd7,0x56,0x72,0x80 + +# ATT: vfmaddcsh %xmm28, %xmm29, %xmm30 +# INTEL: vfmaddcsh xmm30, xmm29, xmm28 +0x62,0x06,0x16,0x00,0x57,0xf4 + +# ATT: vfmaddcsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfmaddcsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x16,0x10,0x57,0xf4 + +# ATT: vfmaddcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfmaddcsh xmm30 {k7}, xmm29, dword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x16,0x07,0x57,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmaddcsh (%r9), %xmm29, %xmm30 +# INTEL: vfmaddcsh xmm30, xmm29, dword ptr [r9] +0x62,0x46,0x16,0x00,0x57,0x31 + +# ATT: vfmaddcsh 508(%rcx), %xmm29, %xmm30 +# INTEL: vfmaddcsh xmm30, xmm29, dword ptr [rcx + 508] +0x62,0x66,0x16,0x00,0x57,0x71,0x7f + +# ATT: vfmaddcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfmaddcsh xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512] +0x62,0x66,0x16,0x87,0x57,0x72,0x80 + +# ATT: vfmulcph %zmm28, %zmm29, %zmm30 +# INTEL: vfmulcph zmm30, zmm29, zmm28 +0x62,0x06,0x16,0x40,0xd6,0xf4 + +# ATT: vfmulcph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmulcph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x16,0x10,0xd6,0xf4 + +# ATT: vfmulcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmulcph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x16,0x47,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmulcph (%r9){1to16}, %zmm29, %zmm30 +# INTEL: vfmulcph zmm30, zmm29, dword ptr [r9]{1to16} +0x62,0x46,0x16,0x50,0xd6,0x31 + +# ATT: vfmulcph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmulcph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x16,0x40,0xd6,0x71,0x7f + +# ATT: vfmulcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmulcph zmm30 {k7} {z}, zmm29, dword ptr [rdx - 
512]{1to16} +0x62,0x66,0x16,0xd7,0xd6,0x72,0x80 + +# ATT: vfmulcsh %xmm28, %xmm29, %xmm30 +# INTEL: vfmulcsh xmm30, xmm29, xmm28 +0x62,0x06,0x16,0x00,0xd7,0xf4 + +# ATT: vfmulcsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfmulcsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x16,0x10,0xd7,0xf4 + +# ATT: vfmulcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfmulcsh xmm30 {k7}, xmm29, dword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x16,0x07,0xd7,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmulcsh (%r9), %xmm29, %xmm30 +# INTEL: vfmulcsh xmm30, xmm29, dword ptr [r9] +0x62,0x46,0x16,0x00,0xd7,0x31 + +# ATT: vfmulcsh 508(%rcx), %xmm29, %xmm30 +# INTEL: vfmulcsh xmm30, xmm29, dword ptr [rcx + 508] +0x62,0x66,0x16,0x00,0xd7,0x71,0x7f + +# ATT: vfmulcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfmulcsh xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512] +0x62,0x66,0x16,0x87,0xd7,0x72,0x80 diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt --- a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt +++ b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt @@ -2212,3 +2212,163 @@ # ATT: vfnmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} # INTEL: vfnmsub231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} 0x62,0xf6,0x55,0x9f,0xbe,0x72,0x80 + +# ATT: vfcmaddcph %ymm4, %ymm5, %ymm6 +# INTEL: vfcmaddcph ymm6, ymm5, ymm4 +0x62,0xf6,0x57,0x28,0x56,0xf4 + +# ATT: vfcmaddcph %xmm4, %xmm5, %xmm6 +# INTEL: vfcmaddcph xmm6, xmm5, xmm4 +0x62,0xf6,0x57,0x08,0x56,0xf4 + +# ATT: vfcmaddcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfcmaddcph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x57,0x2f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfcmaddcph (%ecx){1to8}, %ymm5, %ymm6 +# INTEL: vfcmaddcph ymm6, ymm5, dword ptr [ecx]{1to8} +0x62,0xf6,0x57,0x38,0x56,0x31 + +# ATT: vfcmaddcph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfcmaddcph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x57,0x28,0x56,0x71,0x7f + +# 
ATT: vfcmaddcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfcmaddcph ymm6 {k7} {z}, ymm5, dword ptr [edx - 512]{1to8} +0x62,0xf6,0x57,0xbf,0x56,0x72,0x80 + +# ATT: vfcmaddcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfcmaddcph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x57,0x0f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfcmaddcph (%ecx){1to4}, %xmm5, %xmm6 +# INTEL: vfcmaddcph xmm6, xmm5, dword ptr [ecx]{1to4} +0x62,0xf6,0x57,0x18,0x56,0x31 + +# ATT: vfcmaddcph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfcmaddcph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x57,0x08,0x56,0x71,0x7f + +# ATT: vfcmaddcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfcmaddcph xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]{1to4} +0x62,0xf6,0x57,0x9f,0x56,0x72,0x80 + +# ATT: vfcmulcph %ymm4, %ymm5, %ymm6 +# INTEL: vfcmulcph ymm6, ymm5, ymm4 +0x62,0xf6,0x57,0x28,0xd6,0xf4 + +# ATT: vfcmulcph %xmm4, %xmm5, %xmm6 +# INTEL: vfcmulcph xmm6, xmm5, xmm4 +0x62,0xf6,0x57,0x08,0xd6,0xf4 + +# ATT: vfcmulcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfcmulcph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x57,0x2f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfcmulcph (%ecx){1to8}, %ymm5, %ymm6 +# INTEL: vfcmulcph ymm6, ymm5, dword ptr [ecx]{1to8} +0x62,0xf6,0x57,0x38,0xd6,0x31 + +# ATT: vfcmulcph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfcmulcph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x57,0x28,0xd6,0x71,0x7f + +# ATT: vfcmulcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfcmulcph ymm6 {k7} {z}, ymm5, dword ptr [edx - 512]{1to8} +0x62,0xf6,0x57,0xbf,0xd6,0x72,0x80 + +# ATT: vfcmulcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfcmulcph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x57,0x0f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfcmulcph (%ecx){1to4}, %xmm5, %xmm6 +# INTEL: vfcmulcph xmm6, xmm5, dword ptr [ecx]{1to4} +0x62,0xf6,0x57,0x18,0xd6,0x31 + +# ATT: vfcmulcph 
2032(%ecx), %xmm5, %xmm6 +# INTEL: vfcmulcph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x57,0x08,0xd6,0x71,0x7f + +# ATT: vfcmulcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfcmulcph xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]{1to4} +0x62,0xf6,0x57,0x9f,0xd6,0x72,0x80 + +# ATT: vfmaddcph %ymm4, %ymm5, %ymm6 +# INTEL: vfmaddcph ymm6, ymm5, ymm4 +0x62,0xf6,0x56,0x28,0x56,0xf4 + +# ATT: vfmaddcph %xmm4, %xmm5, %xmm6 +# INTEL: vfmaddcph xmm6, xmm5, xmm4 +0x62,0xf6,0x56,0x08,0x56,0xf4 + +# ATT: vfmaddcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmaddcph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x56,0x2f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmaddcph (%ecx){1to8}, %ymm5, %ymm6 +# INTEL: vfmaddcph ymm6, ymm5, dword ptr [ecx]{1to8} +0x62,0xf6,0x56,0x38,0x56,0x31 + +# ATT: vfmaddcph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmaddcph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x56,0x28,0x56,0x71,0x7f + +# ATT: vfmaddcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmaddcph ymm6 {k7} {z}, ymm5, dword ptr [edx - 512]{1to8} +0x62,0xf6,0x56,0xbf,0x56,0x72,0x80 + +# ATT: vfmaddcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmaddcph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x56,0x0f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmaddcph (%ecx){1to4}, %xmm5, %xmm6 +# INTEL: vfmaddcph xmm6, xmm5, dword ptr [ecx]{1to4} +0x62,0xf6,0x56,0x18,0x56,0x31 + +# ATT: vfmaddcph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmaddcph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x56,0x08,0x56,0x71,0x7f + +# ATT: vfmaddcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmaddcph xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]{1to4} +0x62,0xf6,0x56,0x9f,0x56,0x72,0x80 + +# ATT: vfmulcph %ymm4, %ymm5, %ymm6 +# INTEL: vfmulcph ymm6, ymm5, ymm4 +0x62,0xf6,0x56,0x28,0xd6,0xf4 + +# ATT: vfmulcph %xmm4, %xmm5, %xmm6 +# INTEL: vfmulcph xmm6, xmm5, xmm4 +0x62,0xf6,0x56,0x08,0xd6,0xf4 + +# ATT: vfmulcph 
268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmulcph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x56,0x2f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmulcph (%ecx){1to8}, %ymm5, %ymm6 +# INTEL: vfmulcph ymm6, ymm5, dword ptr [ecx]{1to8} +0x62,0xf6,0x56,0x38,0xd6,0x31 + +# ATT: vfmulcph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmulcph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x56,0x28,0xd6,0x71,0x7f + +# ATT: vfmulcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmulcph ymm6 {k7} {z}, ymm5, dword ptr [edx - 512]{1to8} +0x62,0xf6,0x56,0xbf,0xd6,0x72,0x80 + +# ATT: vfmulcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmulcph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x56,0x0f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmulcph (%ecx){1to4}, %xmm5, %xmm6 +# INTEL: vfmulcph xmm6, xmm5, dword ptr [ecx]{1to4} +0x62,0xf6,0x56,0x18,0xd6,0x31 + +# ATT: vfmulcph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmulcph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x56,0x08,0xd6,0x71,0x7f + +# ATT: vfmulcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmulcph xmm6 {k7} {z}, xmm5, dword ptr [edx - 512]{1to4} +0x62,0xf6,0x56,0x9f,0xd6,0x72,0x80 diff --git a/llvm/test/MC/X86/avx512fp16-complex-fma.s b/llvm/test/MC/X86/avx512fp16-complex-fma.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/avx512fp16-complex-fma.s @@ -0,0 +1,324 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown %s > %t 2> %t.err +// RUN: FileCheck < %t %s +// RUN: FileCheck --check-prefix=CHECK-STDERR < %t.err %s + +// CHECK: vfcmaddcph %zmm24, %zmm23, %zmm24 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph %zmm24, %zmm23, %zmm24 + +// CHECK: vfcmaddcph {rn-sae}, %zmm24, %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph {rn-sae}, %zmm24, %zmm23, %zmm23 + +// CHECK: vfcmaddcph %zmm24, %zmm23, %zmm24 {%k7} +// 
CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph %zmm24, %zmm23, %zmm24 {%k7} + +// CHECK: vfcmaddcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z} + +// CHECK: vfcmaddcph 268435456(%rbp,%r14,8), %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph 268435456(%rbp,%r14,8), %zmm23, %zmm23 + +// CHECK: vfcmaddcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7} + +// CHECK: vfcmaddcph (%rip){1to16}, %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph (%rip){1to16}, %zmm23, %zmm23 + +// CHECK: vfcmaddcph -2048(,%rbp,2), %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph -2048(,%rbp,2), %zmm23, %zmm23 + +// CHECK: vfcmaddcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z} + +// CHECK: vfcmaddcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z} + +// CHECK: vfcmaddcsh %xmm24, %xmm23, %xmm24 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcsh %xmm24, %xmm23, %xmm24 + +// CHECK: vfcmaddcsh {rn-sae}, %xmm24, %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcsh {rn-sae}, %xmm24, %xmm23, %xmm23 + +// CHECK: vfcmaddcsh %xmm24, %xmm23, %xmm24 {%k7} +// CHECK-STDERR: warning: Destination register 
should be distinct from source registers + vfcmaddcsh %xmm24, %xmm23, %xmm24 {%k7} + +// CHECK: vfcmaddcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfcmaddcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23 + +// CHECK: vfcmaddcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} + +// CHECK: vfcmaddcsh (%rip), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcsh (%rip), %xmm23, %xmm23 + +// CHECK: vfcmaddcsh -128(,%rbp,2), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcsh -128(,%rbp,2), %xmm23, %xmm23 + +// CHECK: vfcmaddcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfcmaddcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfcmulcph %zmm24, %zmm23, %zmm24 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph %zmm24, %zmm23, %zmm24 + +// CHECK: vfcmulcph {rn-sae}, %zmm24, %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph {rn-sae}, %zmm24, %zmm23, %zmm23 + +// CHECK: vfcmulcph %zmm24, %zmm23, %zmm24 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph %zmm24, %zmm23, %zmm24 {%k7} 
+ +// CHECK: vfcmulcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z} + +// CHECK: vfcmulcph 268435456(%rbp,%r14,8), %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph 268435456(%rbp,%r14,8), %zmm23, %zmm23 + +// CHECK: vfcmulcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7} + +// CHECK: vfcmulcph (%rip){1to16}, %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph (%rip){1to16}, %zmm23, %zmm23 + +// CHECK: vfcmulcph -2048(,%rbp,2), %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph -2048(,%rbp,2), %zmm23, %zmm23 + +// CHECK: vfcmulcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z} + +// CHECK: vfcmulcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z} + +// CHECK: vfcmulcsh %xmm24, %xmm23, %xmm24 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcsh %xmm24, %xmm23, %xmm24 + +// CHECK: vfcmulcsh {rn-sae}, %xmm24, %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcsh {rn-sae}, %xmm24, %xmm23, %xmm23 + +// CHECK: vfcmulcsh %xmm24, %xmm23, %xmm24 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcsh %xmm24, %xmm23, %xmm24 {%k7} + +// CHECK: vfcmulcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z} 
+// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfcmulcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23 + +// CHECK: vfcmulcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} + +// CHECK: vfcmulcsh (%rip), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcsh (%rip), %xmm23, %xmm23 + +// CHECK: vfcmulcsh -128(,%rbp,2), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcsh -128(,%rbp,2), %xmm23, %xmm23 + +// CHECK: vfcmulcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfcmulcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfmaddcph %zmm24, %zmm23, %zmm24 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph %zmm24, %zmm23, %zmm24 + +// CHECK: vfmaddcph {rn-sae}, %zmm24, %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph {rn-sae}, %zmm24, %zmm23, %zmm23 + +// CHECK: vfmaddcph %zmm24, %zmm23, %zmm24 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph %zmm24, %zmm23, %zmm24 {%k7} + +// CHECK: vfmaddcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + 
vfmaddcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z} + +// CHECK: vfmaddcph 268435456(%rbp,%r14,8), %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph 268435456(%rbp,%r14,8), %zmm23, %zmm23 + +// CHECK: vfmaddcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7} + +// CHECK: vfmaddcph (%rip){1to16}, %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph (%rip){1to16}, %zmm23, %zmm23 + +// CHECK: vfmaddcph -2048(,%rbp,2), %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph -2048(,%rbp,2), %zmm23, %zmm23 + +// CHECK: vfmaddcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z} + +// CHECK: vfmaddcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z} + +// CHECK: vfmaddcsh %xmm24, %xmm23, %xmm24 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcsh %xmm24, %xmm23, %xmm24 + +// CHECK: vfmaddcsh {rn-sae}, %xmm24, %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcsh {rn-sae}, %xmm24, %xmm23, %xmm23 + +// CHECK: vfmaddcsh %xmm24, %xmm23, %xmm24 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcsh %xmm24, %xmm23, %xmm24 {%k7} + +// CHECK: vfmaddcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z} + +// CHECK: 
vfmaddcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23 + +// CHECK: vfmaddcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} + +// CHECK: vfmaddcsh (%rip), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcsh (%rip), %xmm23, %xmm23 + +// CHECK: vfmaddcsh -128(,%rbp,2), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcsh -128(,%rbp,2), %xmm23, %xmm23 + +// CHECK: vfmaddcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfmaddcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfmulcph %zmm24, %zmm23, %zmm24 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph %zmm24, %zmm23, %zmm24 + +// CHECK: vfmulcph {rn-sae}, %zmm24, %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph {rn-sae}, %zmm24, %zmm23, %zmm23 + +// CHECK: vfmulcph %zmm24, %zmm23, %zmm24 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph %zmm24, %zmm23, %zmm24 {%k7} + +// CHECK: vfmulcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph {rz-sae}, %zmm24, %zmm23, %zmm23 {%k7} {z} + +// CHECK: vfmulcph 268435456(%rbp,%r14,8), %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be 
distinct from source registers + vfmulcph 268435456(%rbp,%r14,8), %zmm23, %zmm23 + +// CHECK: vfmulcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph 291(%r8,%rax,4), %zmm23, %zmm23 {%k7} + +// CHECK: vfmulcph (%rip){1to16}, %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph (%rip){1to16}, %zmm23, %zmm23 + +// CHECK: vfmulcph -2048(,%rbp,2), %zmm23, %zmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph -2048(,%rbp,2), %zmm23, %zmm23 + +// CHECK: vfmulcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph 8128(%rcx), %zmm23, %zmm23 {%k7} {z} + +// CHECK: vfmulcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph -512(%rdx){1to16}, %zmm23, %zmm23 {%k7} {z} + +// CHECK: vfmulcsh %xmm24, %xmm23, %xmm24 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcsh %xmm24, %xmm23, %xmm24 + +// CHECK: vfmulcsh {rn-sae}, %xmm24, %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcsh {rn-sae}, %xmm24, %xmm23, %xmm23 + +// CHECK: vfmulcsh %xmm24, %xmm23, %xmm24 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcsh %xmm24, %xmm23, %xmm24 {%k7} + +// CHECK: vfmulcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcsh {rz-sae}, %xmm24, %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfmulcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcsh 268435456(%rbp,%r14,8), %xmm23, %xmm23 + 
+// CHECK: vfmulcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcsh 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} + +// CHECK: vfmulcsh (%rip), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcsh (%rip), %xmm23, %xmm23 + +// CHECK: vfmulcsh -128(,%rbp,2), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcsh -128(,%rbp,2), %xmm23, %xmm23 + +// CHECK: vfmulcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcsh 508(%rcx), %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfmulcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcsh -512(%rdx), %xmm23, %xmm23 {%k7} {z} + diff --git a/llvm/test/MC/X86/avx512fp16-complex-fma_vl.s b/llvm/test/MC/X86/avx512fp16-complex-fma_vl.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/avx512fp16-complex-fma_vl.s @@ -0,0 +1,292 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown %s > %t 2> %t.err +// RUN: FileCheck < %t %s +// RUN: FileCheck --check-prefix=CHECK-STDERR < %t.err %s + +// CHECK: vfcmaddcph %ymm24, %ymm23, %ymm24 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph %ymm24, %ymm23, %ymm24 + +// CHECK: vfcmaddcph %ymm24, %ymm23, %ymm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph %ymm24, %ymm23, %ymm23 {%k7} + +// CHECK: vfcmaddcph %ymm24, %ymm23, %ymm24 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph %ymm24, %ymm23, %ymm24 {%k7} {z} + +// CHECK: vfcmaddcph %xmm24, %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph 
%xmm24, %xmm23, %xmm23 + +// CHECK: vfcmaddcph %xmm24, %xmm23, %xmm24 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph %xmm24, %xmm23, %xmm24 {%k7} + +// CHECK: vfcmaddcph %xmm24, %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph %xmm24, %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfcmaddcph 268435456(%rbp,%r14,8), %ymm23, %ymm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph 268435456(%rbp,%r14,8), %ymm23, %ymm23 + +// CHECK: vfcmaddcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7} + +// CHECK: vfcmaddcph (%rip){1to8}, %ymm23, %ymm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph (%rip){1to8}, %ymm23, %ymm23 + +// CHECK: vfcmaddcph -1024(,%rbp,2), %ymm23, %ymm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph -1024(,%rbp,2), %ymm23, %ymm23 + +// CHECK: vfcmaddcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z} + +// CHECK: vfcmaddcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z} + +// CHECK: vfcmaddcph 268435456(%rbp,%r14,8), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph 268435456(%rbp,%r14,8), %xmm23, %xmm23 + +// CHECK: vfcmaddcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} + 
+// CHECK: vfcmaddcph (%rip){1to4}, %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph (%rip){1to4}, %xmm23, %xmm23 + +// CHECK: vfcmaddcph -512(,%rbp,2), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph -512(,%rbp,2), %xmm23, %xmm23 + +// CHECK: vfcmaddcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfcmaddcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmaddcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfcmulcph %ymm24, %ymm23, %ymm24 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph %ymm24, %ymm23, %ymm24 + +// CHECK: vfcmulcph %ymm24, %ymm23, %ymm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph %ymm24, %ymm23, %ymm23 {%k7} + +// CHECK: vfcmulcph %ymm24, %ymm23, %ymm24 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph %ymm24, %ymm23, %ymm24 {%k7} {z} + +// CHECK: vfcmulcph %xmm24, %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph %xmm24, %xmm23, %xmm23 + +// CHECK: vfcmulcph %xmm24, %xmm23, %xmm24 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph %xmm24, %xmm23, %xmm24 {%k7} + +// CHECK: vfcmulcph %xmm24, %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph %xmm24, %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfcmulcph 268435456(%rbp,%r14,8), %ymm23, %ymm23 +// CHECK-STDERR: warning: Destination register should be distinct from 
source registers + vfcmulcph 268435456(%rbp,%r14,8), %ymm23, %ymm23 + +// CHECK: vfcmulcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7} + +// CHECK: vfcmulcph (%rip){1to8}, %ymm23, %ymm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph (%rip){1to8}, %ymm23, %ymm23 + +// CHECK: vfcmulcph -1024(,%rbp,2), %ymm23, %ymm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph -1024(,%rbp,2), %ymm23, %ymm23 + +// CHECK: vfcmulcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z} + +// CHECK: vfcmulcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z} + +// CHECK: vfcmulcph 268435456(%rbp,%r14,8), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph 268435456(%rbp,%r14,8), %xmm23, %xmm23 + +// CHECK: vfcmulcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} + +// CHECK: vfcmulcph (%rip){1to4}, %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph (%rip){1to4}, %xmm23, %xmm23 + +// CHECK: vfcmulcph -512(,%rbp,2), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph -512(,%rbp,2), %xmm23, %xmm23 + +// CHECK: vfcmulcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph 2032(%rcx), %xmm23, %xmm23 
{%k7} {z} + +// CHECK: vfcmulcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfcmulcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfmaddcph %ymm24, %ymm23, %ymm24 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph %ymm24, %ymm23, %ymm24 + +// CHECK: vfmaddcph %ymm24, %ymm23, %ymm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph %ymm24, %ymm23, %ymm23 {%k7} + +// CHECK: vfmaddcph %ymm24, %ymm23, %ymm24 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph %ymm24, %ymm23, %ymm24 {%k7} {z} + +// CHECK: vfmaddcph %xmm24, %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph %xmm24, %xmm23, %xmm23 + +// CHECK: vfmaddcph %xmm24, %xmm23, %xmm24 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph %xmm24, %xmm23, %xmm24 {%k7} + +// CHECK: vfmaddcph %xmm24, %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph %xmm24, %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfmaddcph 268435456(%rbp,%r14,8), %ymm23, %ymm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph 268435456(%rbp,%r14,8), %ymm23, %ymm23 + +// CHECK: vfmaddcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7} + +// CHECK: vfmaddcph (%rip){1to8}, %ymm23, %ymm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph (%rip){1to8}, %ymm23, %ymm23 + +// CHECK: vfmaddcph -1024(,%rbp,2), %ymm23, %ymm23 +// CHECK-STDERR: warning: Destination register should be 
distinct from source registers + vfmaddcph -1024(,%rbp,2), %ymm23, %ymm23 + +// CHECK: vfmaddcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z} + +// CHECK: vfmaddcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z} + +// CHECK: vfmaddcph 268435456(%rbp,%r14,8), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph 268435456(%rbp,%r14,8), %xmm23, %xmm23 + +// CHECK: vfmaddcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} + +// CHECK: vfmaddcph (%rip){1to4}, %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph (%rip){1to4}, %xmm23, %xmm23 + +// CHECK: vfmaddcph -512(,%rbp,2), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph -512(,%rbp,2), %xmm23, %xmm23 + +// CHECK: vfmaddcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfmaddcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmaddcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfmulcph %ymm24, %ymm23, %ymm24 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph %ymm24, %ymm23, %ymm24 + +// CHECK: vfmulcph %ymm24, %ymm23, %ymm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph %ymm24, %ymm23, %ymm23 
{%k7} + +// CHECK: vfmulcph %ymm24, %ymm23, %ymm24 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph %ymm24, %ymm23, %ymm24 {%k7} {z} + +// CHECK: vfmulcph %xmm24, %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph %xmm24, %xmm23, %xmm23 + +// CHECK: vfmulcph %xmm24, %xmm23, %xmm24 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph %xmm24, %xmm23, %xmm24 {%k7} + +// CHECK: vfmulcph %xmm24, %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph %xmm24, %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfmulcph 268435456(%rbp,%r14,8), %ymm23, %ymm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph 268435456(%rbp,%r14,8), %ymm23, %ymm23 + +// CHECK: vfmulcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph 291(%r8,%rax,4), %ymm23, %ymm23 {%k7} + +// CHECK: vfmulcph (%rip){1to8}, %ymm23, %ymm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph (%rip){1to8}, %ymm23, %ymm23 + +// CHECK: vfmulcph -1024(,%rbp,2), %ymm23, %ymm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph -1024(,%rbp,2), %ymm23, %ymm23 + +// CHECK: vfmulcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph 4064(%rcx), %ymm23, %ymm23 {%k7} {z} + +// CHECK: vfmulcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph -512(%rdx){1to8}, %ymm23, %ymm23 {%k7} {z} + +// CHECK: vfmulcph 268435456(%rbp,%r14,8), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination 
register should be distinct from source registers + vfmulcph 268435456(%rbp,%r14,8), %xmm23, %xmm23 + +// CHECK: vfmulcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph 291(%r8,%rax,4), %xmm23, %xmm23 {%k7} + +// CHECK: vfmulcph (%rip){1to4}, %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph (%rip){1to4}, %xmm23, %xmm23 + +// CHECK: vfmulcph -512(,%rbp,2), %xmm23, %xmm23 +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph -512(,%rbp,2), %xmm23, %xmm23 + +// CHECK: vfmulcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph 2032(%rcx), %xmm23, %xmm23 {%k7} {z} + +// CHECK: vfmulcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z} +// CHECK-STDERR: warning: Destination register should be distinct from source registers + vfmulcph -512(%rdx){1to4}, %xmm23, %xmm23 {%k7} {z} + diff --git a/llvm/test/MC/X86/avx512fp16.s b/llvm/test/MC/X86/avx512fp16.s --- a/llvm/test/MC/X86/avx512fp16.s +++ b/llvm/test/MC/X86/avx512fp16.s @@ -2483,3 +2483,195 @@ // CHECK: vfnmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} // CHECK: encoding: [0x62,0x66,0x15,0x87,0xbf,0x72,0x80] vfnmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfcmaddcph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x17,0x40,0x56,0xf4] + vfcmaddcph %zmm28, %zmm29, %zmm30 + +// CHECK: vfcmaddcph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x17,0x10,0x56,0xf4] + vfcmaddcph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfcmaddcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x17,0x47,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfcmaddcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfcmaddcph (%r9){1to16}, %zmm29, %zmm30 +// CHECK: encoding: 
[0x62,0x46,0x17,0x50,0x56,0x31] + vfcmaddcph (%r9){1to16}, %zmm29, %zmm30 + +// CHECK: vfcmaddcph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x17,0x40,0x56,0x71,0x7f] + vfcmaddcph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfcmaddcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x17,0xd7,0x56,0x72,0x80] + vfcmaddcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfcmaddcsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x17,0x00,0x57,0xf4] + vfcmaddcsh %xmm28, %xmm29, %xmm30 + +// CHECK: vfcmaddcsh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x17,0x10,0x57,0xf4] + vfcmaddcsh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfcmaddcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x17,0x07,0x57,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfcmaddcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfcmaddcsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x17,0x00,0x57,0x31] + vfcmaddcsh (%r9), %xmm29, %xmm30 + +// CHECK: vfcmaddcsh 508(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x17,0x00,0x57,0x71,0x7f] + vfcmaddcsh 508(%rcx), %xmm29, %xmm30 + +// CHECK: vfcmaddcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x17,0x87,0x57,0x72,0x80] + vfcmaddcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfcmulcph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x17,0x40,0xd6,0xf4] + vfcmulcph %zmm28, %zmm29, %zmm30 + +// CHECK: vfcmulcph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x17,0x10,0xd6,0xf4] + vfcmulcph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfcmulcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x17,0x47,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfcmulcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfcmulcph (%r9){1to16}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x17,0x50,0xd6,0x31] + vfcmulcph (%r9){1to16}, %zmm29, %zmm30 + +// 
CHECK: vfcmulcph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x17,0x40,0xd6,0x71,0x7f] + vfcmulcph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfcmulcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x17,0xd7,0xd6,0x72,0x80] + vfcmulcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfcmulcsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x17,0x00,0xd7,0xf4] + vfcmulcsh %xmm28, %xmm29, %xmm30 + +// CHECK: vfcmulcsh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x17,0x10,0xd7,0xf4] + vfcmulcsh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfcmulcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x17,0x07,0xd7,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfcmulcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfcmulcsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x17,0x00,0xd7,0x31] + vfcmulcsh (%r9), %xmm29, %xmm30 + +// CHECK: vfcmulcsh 508(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x17,0x00,0xd7,0x71,0x7f] + vfcmulcsh 508(%rcx), %xmm29, %xmm30 + +// CHECK: vfcmulcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x17,0x87,0xd7,0x72,0x80] + vfcmulcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfmaddcph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x16,0x40,0x56,0xf4] + vfmaddcph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmaddcph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x16,0x10,0x56,0xf4] + vfmaddcph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmaddcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x16,0x47,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmaddcph (%r9){1to16}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x16,0x50,0x56,0x31] + vfmaddcph (%r9){1to16}, %zmm29, %zmm30 + +// CHECK: vfmaddcph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: 
[0x62,0x66,0x16,0x40,0x56,0x71,0x7f] + vfmaddcph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmaddcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x16,0xd7,0x56,0x72,0x80] + vfmaddcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmaddcsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x16,0x00,0x57,0xf4] + vfmaddcsh %xmm28, %xmm29, %xmm30 + +// CHECK: vfmaddcsh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x16,0x10,0x57,0xf4] + vfmaddcsh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfmaddcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x16,0x07,0x57,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfmaddcsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x16,0x00,0x57,0x31] + vfmaddcsh (%r9), %xmm29, %xmm30 + +// CHECK: vfmaddcsh 508(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x16,0x00,0x57,0x71,0x7f] + vfmaddcsh 508(%rcx), %xmm29, %xmm30 + +// CHECK: vfmaddcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x16,0x87,0x57,0x72,0x80] + vfmaddcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfmulcph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x16,0x40,0xd6,0xf4] + vfmulcph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmulcph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x16,0x10,0xd6,0xf4] + vfmulcph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmulcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x16,0x47,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmulcph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmulcph (%r9){1to16}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x16,0x50,0xd6,0x31] + vfmulcph (%r9){1to16}, %zmm29, %zmm30 + +// CHECK: vfmulcph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x16,0x40,0xd6,0x71,0x7f] + vfmulcph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmulcph 
-512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x16,0xd7,0xd6,0x72,0x80] + vfmulcph -512(%rdx){1to16}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmulcsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x16,0x00,0xd7,0xf4] + vfmulcsh %xmm28, %xmm29, %xmm30 + +// CHECK: vfmulcsh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x16,0x10,0xd7,0xf4] + vfmulcsh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfmulcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x16,0x07,0xd7,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmulcsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfmulcsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x16,0x00,0xd7,0x31] + vfmulcsh (%r9), %xmm29, %xmm30 + +// CHECK: vfmulcsh 508(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x16,0x00,0xd7,0x71,0x7f] + vfmulcsh 508(%rcx), %xmm29, %xmm30 + +// CHECK: vfmulcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x16,0x87,0xd7,0x72,0x80] + vfmulcsh -512(%rdx), %xmm29, %xmm30 {%k7} {z} diff --git a/llvm/test/MC/X86/avx512fp16vl.s b/llvm/test/MC/X86/avx512fp16vl.s --- a/llvm/test/MC/X86/avx512fp16vl.s +++ b/llvm/test/MC/X86/avx512fp16vl.s @@ -2211,3 +2211,163 @@ // CHECK: vfnmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} // CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xbe,0x72,0x80] vfnmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfcmaddcph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x57,0x28,0x56,0xf4] + vfcmaddcph %ymm4, %ymm5, %ymm6 + +// CHECK: vfcmaddcph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x57,0x08,0x56,0xf4] + vfcmaddcph %xmm4, %xmm5, %xmm6 + +// CHECK: vfcmaddcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x57,0x2f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfcmaddcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfcmaddcph (%ecx){1to8}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x57,0x38,0x56,0x31] 
+ vfcmaddcph (%ecx){1to8}, %ymm5, %ymm6 + +// CHECK: vfcmaddcph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x57,0x28,0x56,0x71,0x7f] + vfcmaddcph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfcmaddcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x57,0xbf,0x56,0x72,0x80] + vfcmaddcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfcmaddcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x57,0x0f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfcmaddcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfcmaddcph (%ecx){1to4}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x57,0x18,0x56,0x31] + vfcmaddcph (%ecx){1to4}, %xmm5, %xmm6 + +// CHECK: vfcmaddcph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x57,0x08,0x56,0x71,0x7f] + vfcmaddcph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfcmaddcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x57,0x9f,0x56,0x72,0x80] + vfcmaddcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfcmulcph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x57,0x28,0xd6,0xf4] + vfcmulcph %ymm4, %ymm5, %ymm6 + +// CHECK: vfcmulcph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x57,0x08,0xd6,0xf4] + vfcmulcph %xmm4, %xmm5, %xmm6 + +// CHECK: vfcmulcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x57,0x2f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfcmulcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfcmulcph (%ecx){1to8}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x57,0x38,0xd6,0x31] + vfcmulcph (%ecx){1to8}, %ymm5, %ymm6 + +// CHECK: vfcmulcph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x57,0x28,0xd6,0x71,0x7f] + vfcmulcph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfcmulcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x57,0xbf,0xd6,0x72,0x80] + vfcmulcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfcmulcph 268435456(%esp,%esi,8), %xmm5, 
%xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x57,0x0f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfcmulcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfcmulcph (%ecx){1to4}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x57,0x18,0xd6,0x31] + vfcmulcph (%ecx){1to4}, %xmm5, %xmm6 + +// CHECK: vfcmulcph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x57,0x08,0xd6,0x71,0x7f] + vfcmulcph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfcmulcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x57,0x9f,0xd6,0x72,0x80] + vfcmulcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmaddcph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x56,0x28,0x56,0xf4] + vfmaddcph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmaddcph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x56,0x08,0x56,0xf4] + vfmaddcph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmaddcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x56,0x2f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmaddcph (%ecx){1to8}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x56,0x38,0x56,0x31] + vfmaddcph (%ecx){1to8}, %ymm5, %ymm6 + +// CHECK: vfmaddcph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x56,0x28,0x56,0x71,0x7f] + vfmaddcph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmaddcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x56,0xbf,0x56,0x72,0x80] + vfmaddcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmaddcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x56,0x0f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmaddcph (%ecx){1to4}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x56,0x18,0x56,0x31] + vfmaddcph (%ecx){1to4}, %xmm5, %xmm6 + +// CHECK: vfmaddcph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x56,0x08,0x56,0x71,0x7f] + vfmaddcph 2032(%ecx), %xmm5, 
%xmm6 + +// CHECK: vfmaddcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x56,0x9f,0x56,0x72,0x80] + vfmaddcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmulcph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x56,0x28,0xd6,0xf4] + vfmulcph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmulcph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x56,0x08,0xd6,0xf4] + vfmulcph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmulcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x56,0x2f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmulcph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmulcph (%ecx){1to8}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x56,0x38,0xd6,0x31] + vfmulcph (%ecx){1to8}, %ymm5, %ymm6 + +// CHECK: vfmulcph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x56,0x28,0xd6,0x71,0x7f] + vfmulcph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmulcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x56,0xbf,0xd6,0x72,0x80] + vfmulcph -512(%edx){1to8}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmulcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x56,0x0f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmulcph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmulcph (%ecx){1to4}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x56,0x18,0xd6,0x31] + vfmulcph (%ecx){1to4}, %xmm5, %xmm6 + +// CHECK: vfmulcph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x56,0x08,0xd6,0x71,0x7f] + vfmulcph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmulcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x56,0x9f,0xd6,0x72,0x80] + vfmulcph -512(%edx){1to4}, %xmm5, %xmm6 {%k7} {z} diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16.s b/llvm/test/MC/X86/intel-syntax-avx512fp16.s --- a/llvm/test/MC/X86/intel-syntax-avx512fp16.s +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16.s @@ -2355,3 +2355,195 @@ // CHECK: vfnmsub231sh xmm6 {k7} {z}, 
xmm5, word ptr [edx - 256] // CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xbf,0x72,0x80] vfnmsub231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vfcmaddcph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x57,0x48,0x56,0xf4] + vfcmaddcph zmm6, zmm5, zmm4 + +// CHECK: vfcmaddcph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x57,0x18,0x56,0xf4] + vfcmaddcph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfcmaddcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x57,0x4f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfcmaddcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfcmaddcph zmm6, zmm5, dword ptr [ecx]{1to16} +// CHECK: encoding: [0x62,0xf6,0x57,0x58,0x56,0x31] + vfcmaddcph zmm6, zmm5, dword ptr [ecx]{1to16} + +// CHECK: vfcmaddcph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x57,0x48,0x56,0x71,0x7f] + vfcmaddcph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfcmaddcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf6,0x57,0xdf,0x56,0x72,0x80] + vfcmaddcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16} + +// CHECK: vfcmaddcsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x57,0x08,0x57,0xf4] + vfcmaddcsh xmm6, xmm5, xmm4 + +// CHECK: vfcmaddcsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x57,0x18,0x57,0xf4] + vfcmaddcsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfcmaddcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x57,0x0f,0x57,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfcmaddcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456] + +// CHECK: vfcmaddcsh xmm6, xmm5, dword ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x57,0x08,0x57,0x31] + vfcmaddcsh xmm6, xmm5, dword ptr [ecx] + +// CHECK: vfcmaddcsh xmm6, xmm5, dword ptr [ecx + 508] +// CHECK: encoding: [0x62,0xf6,0x57,0x08,0x57,0x71,0x7f] + vfcmaddcsh xmm6, xmm5, dword ptr [ecx + 508] + +// CHECK: vfcmaddcsh xmm6 {k7} {z}, xmm5, 
dword ptr [edx - 512] +// CHECK: encoding: [0x62,0xf6,0x57,0x8f,0x57,0x72,0x80] + vfcmaddcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512] + +// CHECK: vfcmulcph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x57,0x48,0xd6,0xf4] + vfcmulcph zmm6, zmm5, zmm4 + +// CHECK: vfcmulcph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x57,0x18,0xd6,0xf4] + vfcmulcph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfcmulcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x57,0x4f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfcmulcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfcmulcph zmm6, zmm5, dword ptr [ecx]{1to16} +// CHECK: encoding: [0x62,0xf6,0x57,0x58,0xd6,0x31] + vfcmulcph zmm6, zmm5, dword ptr [ecx]{1to16} + +// CHECK: vfcmulcph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x57,0x48,0xd6,0x71,0x7f] + vfcmulcph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfcmulcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf6,0x57,0xdf,0xd6,0x72,0x80] + vfcmulcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16} + +// CHECK: vfcmulcsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x57,0x08,0xd7,0xf4] + vfcmulcsh xmm6, xmm5, xmm4 + +// CHECK: vfcmulcsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x57,0x18,0xd7,0xf4] + vfcmulcsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfcmulcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x57,0x0f,0xd7,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfcmulcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456] + +// CHECK: vfcmulcsh xmm6, xmm5, dword ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x57,0x08,0xd7,0x31] + vfcmulcsh xmm6, xmm5, dword ptr [ecx] + +// CHECK: vfcmulcsh xmm6, xmm5, dword ptr [ecx + 508] +// CHECK: encoding: [0x62,0xf6,0x57,0x08,0xd7,0x71,0x7f] + vfcmulcsh xmm6, xmm5, dword ptr [ecx + 508] + +// CHECK: vfcmulcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512] +// 
CHECK: encoding: [0x62,0xf6,0x57,0x8f,0xd7,0x72,0x80] + vfcmulcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512] + +// CHECK: vfmaddcph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x56,0x48,0x56,0xf4] + vfmaddcph zmm6, zmm5, zmm4 + +// CHECK: vfmaddcph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x56,0x18,0x56,0xf4] + vfmaddcph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmaddcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x56,0x4f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmaddcph zmm6, zmm5, dword ptr [ecx]{1to16} +// CHECK: encoding: [0x62,0xf6,0x56,0x58,0x56,0x31] + vfmaddcph zmm6, zmm5, dword ptr [ecx]{1to16} + +// CHECK: vfmaddcph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x56,0x48,0x56,0x71,0x7f] + vfmaddcph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmaddcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf6,0x56,0xdf,0x56,0x72,0x80] + vfmaddcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16} + +// CHECK: vfmaddcsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x56,0x08,0x57,0xf4] + vfmaddcsh xmm6, xmm5, xmm4 + +// CHECK: vfmaddcsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x56,0x18,0x57,0xf4] + vfmaddcsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfmaddcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x56,0x0f,0x57,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmaddcsh xmm6, xmm5, dword ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x56,0x08,0x57,0x31] + vfmaddcsh xmm6, xmm5, dword ptr [ecx] + +// CHECK: vfmaddcsh xmm6, xmm5, dword ptr [ecx + 508] +// CHECK: encoding: [0x62,0xf6,0x56,0x08,0x57,0x71,0x7f] + vfmaddcsh xmm6, xmm5, dword ptr [ecx + 508] + +// CHECK: vfmaddcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512] +// CHECK: encoding: 
[0x62,0xf6,0x56,0x8f,0x57,0x72,0x80] + vfmaddcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512] + +// CHECK: vfmulcph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x56,0x48,0xd6,0xf4] + vfmulcph zmm6, zmm5, zmm4 + +// CHECK: vfmulcph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x56,0x18,0xd6,0xf4] + vfmulcph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmulcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x56,0x4f,0xd6,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmulcph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmulcph zmm6, zmm5, dword ptr [ecx]{1to16} +// CHECK: encoding: [0x62,0xf6,0x56,0x58,0xd6,0x31] + vfmulcph zmm6, zmm5, dword ptr [ecx]{1to16} + +// CHECK: vfmulcph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x56,0x48,0xd6,0x71,0x7f] + vfmulcph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmulcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf6,0x56,0xdf,0xd6,0x72,0x80] + vfmulcph zmm6 {k7} {z}, zmm5, dword ptr [edx - 512]{1to16} + +// CHECK: vfmulcsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x56,0x08,0xd7,0xf4] + vfmulcsh xmm6, xmm5, xmm4 + +// CHECK: vfmulcsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x56,0x18,0xd7,0xf4] + vfmulcsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfmulcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x56,0x0f,0xd7,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmulcsh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmulcsh xmm6, xmm5, dword ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x56,0x08,0xd7,0x31] + vfmulcsh xmm6, xmm5, dword ptr [ecx] + +// CHECK: vfmulcsh xmm6, xmm5, dword ptr [ecx + 508] +// CHECK: encoding: [0x62,0xf6,0x56,0x08,0xd7,0x71,0x7f] + vfmulcsh xmm6, xmm5, dword ptr [ecx + 508] + +// CHECK: vfmulcsh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512] +// CHECK: encoding: [0x62,0xf6,0x56,0x8f,0xd7,0x72,0x80] + vfmulcsh 
xmm6 {k7} {z}, xmm5, dword ptr [edx - 512] diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s --- a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s @@ -2211,3 +2211,163 @@ // CHECK: vfnmsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} // CHECK: encoding: [0x62,0x66,0x15,0x97,0xbe,0x72,0x80] vfnmsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfcmaddcph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x17,0x20,0x56,0xf4] + vfcmaddcph ymm30, ymm29, ymm28 + +// CHECK: vfcmaddcph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x17,0x00,0x56,0xf4] + vfcmaddcph xmm30, xmm29, xmm28 + +// CHECK: vfcmaddcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x17,0x27,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfcmaddcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfcmaddcph ymm30, ymm29, dword ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x17,0x30,0x56,0x31] + vfcmaddcph ymm30, ymm29, dword ptr [r9]{1to8} + +// CHECK: vfcmaddcph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x17,0x20,0x56,0x71,0x7f] + vfcmaddcph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfcmaddcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8} +// CHECK: encoding: [0x62,0x66,0x17,0xb7,0x56,0x72,0x80] + vfcmaddcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8} + +// CHECK: vfcmaddcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x17,0x07,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfcmaddcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfcmaddcph xmm30, xmm29, dword ptr [r9]{1to4} +// CHECK: encoding: [0x62,0x46,0x17,0x10,0x56,0x31] + vfcmaddcph xmm30, xmm29, dword ptr [r9]{1to4} + +// CHECK: vfcmaddcph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x17,0x00,0x56,0x71,0x7f] + 
vfcmaddcph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfcmaddcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4} +// CHECK: encoding: [0x62,0x66,0x17,0x97,0x56,0x72,0x80] + vfcmaddcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4} + +// CHECK: vfcmulcph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x17,0x20,0xd6,0xf4] + vfcmulcph ymm30, ymm29, ymm28 + +// CHECK: vfcmulcph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x17,0x00,0xd6,0xf4] + vfcmulcph xmm30, xmm29, xmm28 + +// CHECK: vfcmulcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x17,0x27,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfcmulcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfcmulcph ymm30, ymm29, dword ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x17,0x30,0xd6,0x31] + vfcmulcph ymm30, ymm29, dword ptr [r9]{1to8} + +// CHECK: vfcmulcph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x17,0x20,0xd6,0x71,0x7f] + vfcmulcph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfcmulcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8} +// CHECK: encoding: [0x62,0x66,0x17,0xb7,0xd6,0x72,0x80] + vfcmulcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8} + +// CHECK: vfcmulcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x17,0x07,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfcmulcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfcmulcph xmm30, xmm29, dword ptr [r9]{1to4} +// CHECK: encoding: [0x62,0x46,0x17,0x10,0xd6,0x31] + vfcmulcph xmm30, xmm29, dword ptr [r9]{1to4} + +// CHECK: vfcmulcph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x17,0x00,0xd6,0x71,0x7f] + vfcmulcph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfcmulcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4} +// CHECK: encoding: [0x62,0x66,0x17,0x97,0xd6,0x72,0x80] + vfcmulcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 
512]{1to4} + +// CHECK: vfmaddcph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x16,0x20,0x56,0xf4] + vfmaddcph ymm30, ymm29, ymm28 + +// CHECK: vfmaddcph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x16,0x00,0x56,0xf4] + vfmaddcph xmm30, xmm29, xmm28 + +// CHECK: vfmaddcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x16,0x27,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmaddcph ymm30, ymm29, dword ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x16,0x30,0x56,0x31] + vfmaddcph ymm30, ymm29, dword ptr [r9]{1to8} + +// CHECK: vfmaddcph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x16,0x20,0x56,0x71,0x7f] + vfmaddcph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmaddcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8} +// CHECK: encoding: [0x62,0x66,0x16,0xb7,0x56,0x72,0x80] + vfmaddcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8} + +// CHECK: vfmaddcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x16,0x07,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmaddcph xmm30, xmm29, dword ptr [r9]{1to4} +// CHECK: encoding: [0x62,0x46,0x16,0x10,0x56,0x31] + vfmaddcph xmm30, xmm29, dword ptr [r9]{1to4} + +// CHECK: vfmaddcph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x16,0x00,0x56,0x71,0x7f] + vfmaddcph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmaddcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4} +// CHECK: encoding: [0x62,0x66,0x16,0x97,0x56,0x72,0x80] + vfmaddcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4} + +// CHECK: vfmulcph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x16,0x20,0xd6,0xf4] + vfmulcph ymm30, ymm29, ymm28 + +// CHECK: vfmulcph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x16,0x00,0xd6,0xf4] + 
vfmulcph xmm30, xmm29, xmm28 + +// CHECK: vfmulcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x16,0x27,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmulcph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmulcph ymm30, ymm29, dword ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x16,0x30,0xd6,0x31] + vfmulcph ymm30, ymm29, dword ptr [r9]{1to8} + +// CHECK: vfmulcph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x16,0x20,0xd6,0x71,0x7f] + vfmulcph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmulcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8} +// CHECK: encoding: [0x62,0x66,0x16,0xb7,0xd6,0x72,0x80] + vfmulcph ymm30 {k7} {z}, ymm29, dword ptr [rdx - 512]{1to8} + +// CHECK: vfmulcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x16,0x07,0xd6,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmulcph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmulcph xmm30, xmm29, dword ptr [r9]{1to4} +// CHECK: encoding: [0x62,0x46,0x16,0x10,0xd6,0x31] + vfmulcph xmm30, xmm29, dword ptr [r9]{1to4} + +// CHECK: vfmulcph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x16,0x00,0xd6,0x71,0x7f] + vfmulcph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmulcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4} +// CHECK: encoding: [0x62,0x66,0x16,0x97,0xd6,0x72,0x80] + vfmulcph xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512]{1to4}