diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -1850,6 +1850,29 @@ TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_128, "vV4iV4iUc*Uc*", "nV:128:", "avx512vp2intersect,avx512vl") // AVX512 fp16 intrinsics +TARGET_BUILTIN(__builtin_ia32_vcomish, "iV8xV8xIiIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_addph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_subph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_mulph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_divph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_maxph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_minph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_minph256, "V16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_minph128, "V8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_maxph256, "V16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_maxph128, "V8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl") + +TARGET_BUILTIN(__builtin_ia32_addsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_divsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_mulsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_subsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_maxsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_minsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_cmpph512_mask, "UiV32xV32xIiUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_cmpph256_mask, "UsV16xV16xIiUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_cmpph128_mask, "UcV8xV8xIiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_cmpsh_mask, "UcV8xV8xIiUcIi", "ncV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_loadsh128_mask, "V8xV8x*V8xUc", "nV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_storesh128_mask, "vV8x*V8xUc", "nV:128:", "avx512fp16") @@ -1886,12 +1909,24 @@ TARGET_BUILTIN(__builtin_ia32_reduce_and_q512, "OiV8Oi", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fadd_pd512, "ddV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ps512, "ffV16f", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph512, "xxV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph256, "xxV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph128, "xxV8x", "ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_reduce_fmax_pd512, "dV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ps512, "fV16f", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph512, "xV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph256, "xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph128, "xV8x", "ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_reduce_fmin_pd512, "dV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ps512, "fV16f", "ncV:512:", 
"avx512f") +TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph512, "xV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph256, "xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph128, "xV8x", "ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_reduce_fmul_pd512, "ddV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ps512, "ffV16f", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph512, "xxV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph256, "xxV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph128, "xxV8x", "ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_reduce_mul_d512, "iV16i", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_mul_q512, "OiV8Oi", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_or_d512, "iV16i", "ncV:512:", "avx512f") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14137,28 +14137,40 @@ return Builder.CreateCall(F, {Ops[0]}); } case X86::BI__builtin_ia32_reduce_fadd_pd512: - case X86::BI__builtin_ia32_reduce_fadd_ps512: { + case X86::BI__builtin_ia32_reduce_fadd_ps512: + case X86::BI__builtin_ia32_reduce_fadd_ph512: + case X86::BI__builtin_ia32_reduce_fadd_ph256: + case X86::BI__builtin_ia32_reduce_fadd_ph128: { Function *F = CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType()); Builder.getFastMathFlags().setAllowReassoc(); return Builder.CreateCall(F, {Ops[0], Ops[1]}); } case X86::BI__builtin_ia32_reduce_fmul_pd512: - case X86::BI__builtin_ia32_reduce_fmul_ps512: { + case X86::BI__builtin_ia32_reduce_fmul_ps512: + case X86::BI__builtin_ia32_reduce_fmul_ph512: + case X86::BI__builtin_ia32_reduce_fmul_ph256: + case X86::BI__builtin_ia32_reduce_fmul_ph128: { Function *F = CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType()); Builder.getFastMathFlags().setAllowReassoc(); return Builder.CreateCall(F, {Ops[0], Ops[1]}); } case X86::BI__builtin_ia32_reduce_fmax_pd512: - case X86::BI__builtin_ia32_reduce_fmax_ps512: { + case X86::BI__builtin_ia32_reduce_fmax_ps512: + case X86::BI__builtin_ia32_reduce_fmax_ph512: + case X86::BI__builtin_ia32_reduce_fmax_ph256: + case X86::BI__builtin_ia32_reduce_fmax_ph128: { Function *F = CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType()); Builder.getFastMathFlags().setNoNaNs(); return Builder.CreateCall(F, {Ops[0]}); } case X86::BI__builtin_ia32_reduce_fmin_pd512: - case X86::BI__builtin_ia32_reduce_fmin_ps512: { + case X86::BI__builtin_ia32_reduce_fmin_ps512: + case X86::BI__builtin_ia32_reduce_fmin_ph512: + case X86::BI__builtin_ia32_reduce_fmin_ph256: + case X86::BI__builtin_ia32_reduce_fmin_ph128: { Function *F = CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType()); Builder.getFastMathFlags().setNoNaNs(); @@ -14422,6 +14434,9 @@ case X86::BI__builtin_ia32_cmpordps: case X86::BI__builtin_ia32_cmpordpd: return getVectorFCmpIR(CmpInst::FCMP_ORD, /*IsSignaling*/false); + case X86::BI__builtin_ia32_cmpph128_mask: + case X86::BI__builtin_ia32_cmpph256_mask: + case X86::BI__builtin_ia32_cmpph512_mask: case X86::BI__builtin_ia32_cmpps128_mask: case X86::BI__builtin_ia32_cmpps256_mask: case X86::BI__builtin_ia32_cmpps512_mask: diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h --- a/clang/lib/Headers/avx512fp16intrin.h +++ 
b/clang/lib/Headers/avx512fp16intrin.h @@ -269,10 +269,539 @@ 29, 30, 31); } +#define _mm_comi_round_sh(A, B, P, R) \ + __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R)) + +#define _mm_comi_sh(A, B, pred) \ + _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION) + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A + (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_add_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_add_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_add_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_add_round_ph(U, A, B, R) \ + 
((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A - (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_sub_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_sub_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_sub_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_sub_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A * (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_mul_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_mul_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_mul_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_mul_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A / (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_div_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_div_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_div_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_div_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), 
(__v32hf)_mm512_div_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_min_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_min_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_min_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_min_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_max_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_max_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_max_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_max_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) { return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A); } +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A, + __m128h __B) { + __A[0] += __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_add_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_add_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_add_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_addsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_add_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_addsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), 
(__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_add_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_addsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A, + __m128h __B) { + __A[0] -= __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_sub_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_sub_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_sub_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_subsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_sub_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_subsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_sub_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_subsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A, + __m128h __B) { + __A[0] *= __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_mul_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_mul_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_mul_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_mulsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_mul_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_mulsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_mul_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_mulsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A, + __m128h __B) { + __A[0] /= __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_div_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_div_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_div_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_divsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_div_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_divsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_div_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_divsh_round_mask( \ + 
(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_min_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_minsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_min_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_minsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_min_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_minsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_max_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_maxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_max_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_maxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_max_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_maxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_cmp_round_ph_mask(A, B, P, R) \ + ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(P), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \ + ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(P), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_cmp_ph_mask(A, B, P) \ + _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_cmp_ph_mask(U, A, B, P) \ + _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm_cmp_round_sh_mask(X, Y, P, R) \ + ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), 
(int)(P), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \ + ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), (int)(P), \ + (__mmask8)(M), (int)(R))) + +#define _mm_cmp_sh_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpsh_mask( \ + (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_sh_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpsh_mask( \ + (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \ + _MM_FROUND_CUR_DIRECTION)) // loads with vmovsh: static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) { struct __mm_load_sh_struct { @@ -418,6 +947,26 @@ return __b[0]; } +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_add_ph(__m512h __W) { + return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_mul_ph(__m512h __W) { + return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_ph(__m512h __V) { + return __builtin_ia32_reduce_fmax_ph512(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_ph(__m512h __V) { + return __builtin_ia32_reduce_fmin_ph512(__V); +} + static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W, diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -69,6 +69,240 @@ _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8), \ (h7), (h6), (h5), (h4), (h3), (h2), (h1)) +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A + (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A + (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A - (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) { + 
return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A - (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A * (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A * (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A / (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A / (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_minph256((__v16hf)__A, 
(__v16hf)__B), + (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)_mm_setzero_ph()); +} + static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) { return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A); } @@ -77,6 +311,22 @@ return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); } +#define _mm256_cmp_ph_mask(a, b, p) \ + ((__mmask16)__builtin_ia32_cmpph256_mask( \ + (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1)) + +#define _mm256_mask_cmp_ph_mask(m, a, b, p) \ + ((__mmask16)__builtin_ia32_cmpph256_mask( \ + (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m))) + +#define _mm_cmp_ph_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmpph128_mask( \ + (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1)) + +#define _mm_mask_cmp_ph_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmpph128_mask( \ + (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m))) + static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { @@ -112,6 +362,46 @@ return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); } 
+static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_add_ph(__m256h __W) { + return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_mul_ph(__m256h __W) { + return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_max_ph(__m256h __V) { + return __builtin_ia32_reduce_fmax_ph256(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_min_ph(__m256h __V) { + return __builtin_ia32_reduce_fmin_ph256(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_add_ph(__m128h __W) { + return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_mul_ph(__m128h __W) { + return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_max_ph(__m128h __V) { + return __builtin_ia32_reduce_fmax_ph128(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_min_ph(__m128h __V) { + return __builtin_ia32_reduce_fmin_ph128(__V); +} + #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3884,6 +3884,8 @@ case X86::BI__builtin_ia32_maxps512: case X86::BI__builtin_ia32_minpd512: case X86::BI__builtin_ia32_minps512: + case X86::BI__builtin_ia32_maxph512: + case X86::BI__builtin_ia32_minph512: ArgNum = 2; break; case X86::BI__builtin_ia32_cvtps2pd512_mask: @@ -3905,6 +3907,7 @@ case X86::BI__builtin_ia32_rsqrt28ps_mask: case X86::BI__builtin_ia32_vcomisd: case X86::BI__builtin_ia32_vcomiss: + case X86::BI__builtin_ia32_vcomish: case X86::BI__builtin_ia32_vcvtph2ps512_mask: ArgNum = 3; break; @@ -3912,6 +3915,7 @@ case X86::BI__builtin_ia32_cmpps512_mask: case X86::BI__builtin_ia32_cmpsd_mask: case X86::BI__builtin_ia32_cmpss_mask: + case X86::BI__builtin_ia32_cmpsh_mask: case X86::BI__builtin_ia32_cvtss2sd_round_mask: case X86::BI__builtin_ia32_getexpsd128_round_mask: case X86::BI__builtin_ia32_getexpss128_round_mask: @@ -3919,8 +3923,10 @@ case X86::BI__builtin_ia32_getmantps512_mask: case X86::BI__builtin_ia32_maxsd_round_mask: case X86::BI__builtin_ia32_maxss_round_mask: + case X86::BI__builtin_ia32_maxsh_round_mask: case X86::BI__builtin_ia32_minsd_round_mask: case X86::BI__builtin_ia32_minss_round_mask: + case X86::BI__builtin_ia32_minsh_round_mask: case X86::BI__builtin_ia32_rcp28sd_round_mask: case X86::BI__builtin_ia32_rcp28ss_round_mask: case X86::BI__builtin_ia32_reducepd512_mask: @@ -3964,6 +3970,10 @@ ArgNum = 1; HasRC = true; break; + case X86::BI__builtin_ia32_addph512: + case X86::BI__builtin_ia32_divph512: + case X86::BI__builtin_ia32_mulph512: + case X86::BI__builtin_ia32_subph512: case X86::BI__builtin_ia32_addpd512: case X86::BI__builtin_ia32_addps512: case X86::BI__builtin_ia32_divpd512: @@ -3999,12 +4009,16 @@ ArgNum = 3; HasRC = true; break; + case X86::BI__builtin_ia32_addsh_round_mask: case X86::BI__builtin_ia32_addss_round_mask: case X86::BI__builtin_ia32_addsd_round_mask: + case X86::BI__builtin_ia32_divsh_round_mask: case X86::BI__builtin_ia32_divss_round_mask: case X86::BI__builtin_ia32_divsd_round_mask: + case X86::BI__builtin_ia32_mulsh_round_mask: case X86::BI__builtin_ia32_mulss_round_mask: case X86::BI__builtin_ia32_mulsd_round_mask: + case X86::BI__builtin_ia32_subsh_round_mask: case 
X86::BI__builtin_ia32_subss_round_mask: case X86::BI__builtin_ia32_subsd_round_mask: case X86::BI__builtin_ia32_scalefpd512_mask: diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c --- a/clang/test/CodeGen/X86/avx512fp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c @@ -337,12 +337,1055 @@ return _mm512_zextph256_ph512(__a); } +int test_mm_comi_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comi_round_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 0, i32 8) + return _mm_comi_round_sh(__A, __B, 0, _MM_FROUND_NO_EXC); +} + +int test_mm_comi_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comi_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 0, i32 4) + return _mm_comi_sh(__A, __B, 0); +} + +int test_mm_comieq_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comieq_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 16, i32 4) + return _mm_comieq_sh(__A, __B); +} + +int test_mm_comilt_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comilt_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 1, i32 4) + return _mm_comilt_sh(__A, __B); +} + +int test_mm_comile_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comile_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 2, i32 4) + return _mm_comile_sh(__A, __B); +} + +int test_mm_comigt_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comigt_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 14, i32 4) + return _mm_comigt_sh(__A, __B); +} + +int test_mm_comige_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comige_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 13, i32 4) + return _mm_comige_sh(__A, __B); +} + +int test_mm_comineq_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comineq_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 20, i32 4) + return _mm_comineq_sh(__A, __B); +} + +int test_mm_ucomieq_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_ucomieq_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 0, i32 4) + return _mm_ucomieq_sh(__A, __B); +} + +int test_mm_ucomilt_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_ucomilt_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 17, i32 4) + return _mm_ucomilt_sh(__A, __B); +} + +int test_mm_ucomile_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_ucomile_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 18, i32 4) + return _mm_ucomile_sh(__A, __B); +} + +int test_mm_ucomigt_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_ucomigt_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 30, i32 4) + return _mm_ucomigt_sh(__A, __B); +} + +int test_mm_ucomige_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_ucomige_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 29, i32 4) + return _mm_ucomige_sh(__A, __B); +} + +int 
test_mm_ucomineq_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_ucomineq_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 4, i32 4) + return _mm_ucomineq_sh(__A, __B); +} + +__m512h test_mm512_add_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_add_ph + // CHECK: %{{.*}} = fadd <32 x half> %{{.*}}, %{{.*}} + return _mm512_add_ph(__A, __B); +} + +__m512h test_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_add_ph + // CHECK: %{{.*}} = fadd <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return (__m512h)_mm512_mask_add_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_add_ph + // CHECK: %{{.*}} = fadd <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_add_ph(__U, __A, __B); +} + +__m512h test_mm512_add_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_add_round_ph + // CHECK: @llvm.x86.avx512fp16.add.ph.512 + return _mm512_add_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_mask_add_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_add_round_ph + // CHECK: @llvm.x86.avx512fp16.add.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_add_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_maskz_add_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_add_round_ph + // CHECK: @llvm.x86.avx512fp16.add.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_add_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_sub_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_sub_ph + // CHECK: %{{.*}} = fsub <32 x half> %{{.*}}, %{{.*}} + return _mm512_sub_ph(__A, __B); +} + +__m512h test_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_sub_ph + // CHECK: %{{.*}} = fsub <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return (__m512h)_mm512_mask_sub_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_sub_ph + // CHECK: %{{.*}} = fsub <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_sub_ph(__U, __A, __B); +} + +__m512h test_mm512_sub_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_sub_round_ph + // CHECK: @llvm.x86.avx512fp16.sub.ph.512 + return _mm512_sub_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_mask_sub_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_sub_round_ph + // CHECK: @llvm.x86.avx512fp16.sub.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_sub_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_maskz_sub_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_sub_round_ph + // 
CHECK: @llvm.x86.avx512fp16.sub.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_sub_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mul_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mul_ph + // CHECK: %{{.*}} = fmul <32 x half> %{{.*}}, %{{.*}} + return _mm512_mul_ph(__A, __B); +} + +__m512h test_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_mul_ph + // CHECK: %{{.*}} = fmul <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return (__m512h)_mm512_mask_mul_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_mul_ph + // CHECK: %{{.*}} = fmul <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_mul_ph(__U, __A, __B); +} + +__m512h test_mm512_mul_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mul_round_ph + // CHECK: @llvm.x86.avx512fp16.mul.ph.512 + return _mm512_mul_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_mask_mul_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_mul_round_ph + // CHECK: @llvm.x86.avx512fp16.mul.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_mul_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_maskz_mul_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_mul_round_ph + // CHECK: @llvm.x86.avx512fp16.mul.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_mul_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_div_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_div_ph + // CHECK: %{{.*}} = fdiv <32 x half> %{{.*}}, %{{.*}} + return _mm512_div_ph(__A, __B); +} + +__m512h test_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_div_ph + // CHECK: %{{.*}} = fdiv <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return (__m512h)_mm512_mask_div_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_div_ph + // CHECK: %{{.*}} = fdiv <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_div_ph(__U, __A, __B); +} + +__m512h test_mm512_div_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_div_round_ph + // CHECK: @llvm.x86.avx512fp16.div.ph.512 + return _mm512_div_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_mask_div_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_div_round_ph + // CHECK: @llvm.x86.avx512fp16.div.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_div_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_maskz_div_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_div_round_ph + // CHECK: 
@llvm.x86.avx512fp16.div.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_div_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_min_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.512 + return _mm512_min_ph(__A, __B); +} + +__m512h test_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return (__m512h)_mm512_mask_min_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_min_ph(__U, __A, __B); +} + +__m512h test_mm512_min_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_min_round_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.512 + return _mm512_min_round_ph(__A, __B, _MM_FROUND_NO_EXC); +} +__m512h test_mm512_mask_min_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_min_round_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_min_round_ph(__W, __U, __A, __B, _MM_FROUND_NO_EXC); +} +__m512h test_mm512_maskz_min_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_min_round_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_min_round_ph(__U, __A, __B, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_max_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.512 + + return _mm512_max_ph(__A, __B); +} + +__m512h test_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return (__m512h)_mm512_mask_max_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_max_ph(__U, __A, __B); +} + +__m512h test_mm512_max_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_max_round_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.512 + return _mm512_max_round_ph(__A, __B, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_max_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_max_round_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_max_round_ph(__W, __U, __A, __B, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_max_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_max_round_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_max_round_ph(__U, __A, __B, _MM_FROUND_NO_EXC); +} + __m512h 
test_mm512_abs_ph(__m512h a) { // CHECK-LABEL: @test_mm512_abs_ph // CHECK: and <16 x i32> return _mm512_abs_ph(a); } +__m128h test_mm_add_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_add_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.add.sh.round + return _mm_add_round_sh(__A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_add_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_add_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.add.sh.round + return _mm_mask_add_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_maskz_add_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_add_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.add.sh.round + return _mm_maskz_add_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_add_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fadd half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_mask_add_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_add_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fadd half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_maskz_add_sh(__U, __A, __B); +} + +__m128h test_mm_add_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_add_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fadd half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + return _mm_add_sh(__A, __B); +} + +__m128h test_mm_sub_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_sub_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.sub.sh.round + return _mm_sub_round_sh(__A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_sub_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_sub_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.sub.sh.round + return _mm_mask_sub_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_maskz_sub_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_sub_round_sh + 
// CHECK: @llvm.x86.avx512fp16.mask.sub.sh.round + return _mm_maskz_sub_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_sub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_sub_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fsub half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_mask_sub_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_sub_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fsub half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_maskz_sub_sh(__U, __A, __B); +} + +__m128h test_mm_sub_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_sub_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fsub half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + return _mm_sub_sh(__A, __B); +} + +__m128h test_mm_mul_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mul_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.mul.sh.round + return _mm_mul_round_sh(__A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_mul_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_mul_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.mul.sh.round + return _mm_mask_mul_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_maskz_mul_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_mul_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.mul.sh.round + return _mm_maskz_mul_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_mul_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_mul_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fmul half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> 
%{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_mask_mul_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_mul_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fmul half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_maskz_mul_sh(__U, __A, __B); +} + +__m128h test_mm_mul_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mul_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fmul half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + return _mm_mul_sh(__A, __B); +} + +__m128h test_mm_div_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_div_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.div.sh.round + return _mm_div_round_sh(__A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_div_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_div_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.div.sh.round + return _mm_mask_div_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_maskz_div_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_div_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.div.sh.round + return _mm_maskz_div_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_div_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_div_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fdiv half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_mask_div_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_div_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fdiv half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 
%{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_maskz_div_sh(__U, __A, __B); +} + +__m128h test_mm_div_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_div_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fdiv half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + return _mm_div_sh(__A, __B); +} + +__m128h test_mm_min_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_min_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round + return _mm_min_round_sh(__A, __B, 0x08); +} +__m128h test_mm_mask_min_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_min_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round + return _mm_mask_min_round_sh(__W, __U, __A, __B, 0x08); +} +__m128h test_mm_maskz_min_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_min_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round + return _mm_maskz_min_round_sh(__U, __A, __B, 0x08); +} +__m128h test_mm_mask_min_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_min_sh + // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round + return _mm_mask_min_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_min_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_min_sh + // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round + return _mm_maskz_min_sh(__U, __A, __B); +} + +__m128h test_mm_min_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_min_sh + // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round + return _mm_min_sh(__A, __B); +} + +__m128h test_mm_max_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_max_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round + return _mm_max_round_sh(__A, __B, 0x08); +} +__m128h test_mm_mask_max_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_max_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round + return _mm_mask_max_round_sh(__W, __U, __A, __B, 0x08); +} +__m128h test_mm_maskz_max_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_max_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round + return _mm_maskz_max_round_sh(__U, __A, __B, 0x08); +} +__m128h test_mm_mask_max_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_max_sh + // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round + return _mm_mask_max_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_max_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_max_sh + // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round + return _mm_maskz_max_sh(__U, __A, __B); +} + +__m128h test_mm_max_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_max_sh + // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round + return _mm_max_sh(__A, __B); +} +__mmask32 test_mm512_cmp_round_ph_mask(__m512h a, __m512h b) { + // CHECK-LABEL: @test_mm512_cmp_round_ph_mask + // CHECK: fcmp oeq <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_round_ph_mask(a, b, 0, _MM_FROUND_NO_EXC); +} + +__mmask32 test_mm512_mask_cmp_round_ph_mask(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: 
@test_mm512_mask_cmp_round_ph_mask + // CHECK: [[CMP:%.*]] = fcmp oeq <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_round_ph_mask(m, a, b, 0, _MM_FROUND_NO_EXC); +} + +__mmask32 test_mm512_cmp_ph_mask_eq_oq(__m512h a, __m512h b) { + // CHECK-LABEL: @test_mm512_cmp_ph_mask_eq_oq + // CHECK: fcmp oeq <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_lt_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_lt_os + // CHECK: fcmp olt <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_LT_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_le_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_le_os + // CHECK: fcmp ole <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_LE_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_unord_q(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_unord_q + // CHECK: fcmp uno <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_UNORD_Q); +} + +__mmask32 test_mm512_cmp_ph_mask_neq_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_neq_uq + // CHECK: fcmp une <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NEQ_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_nlt_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_nlt_us + // CHECK: fcmp uge <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NLT_US); +} + +__mmask32 test_mm512_cmp_ph_mask_nle_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_nle_us + // CHECK: fcmp ugt <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NLE_US); +} + +__mmask32 test_mm512_cmp_ph_mask_ord_q(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_ord_q + // CHECK: fcmp ord <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_ORD_Q); +} + +__mmask32 test_mm512_cmp_ph_mask_eq_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_eq_uq + // CHECK: fcmp ueq <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_EQ_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_nge_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_nge_us + // CHECK: fcmp ult <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NGE_US); +} + +__mmask32 test_mm512_cmp_ph_mask_ngt_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_ngt_us + // CHECK: fcmp ule <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NGT_US); +} + +__mmask32 test_mm512_cmp_ph_mask_false_oq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_false_oq + // CHECK: fcmp false <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_FALSE_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_neq_oq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_neq_oq + // CHECK: fcmp one <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NEQ_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_ge_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_ge_os + // CHECK: fcmp oge <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_GE_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_gt_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_gt_os + // CHECK: fcmp ogt <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_GT_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_true_uq(__m512h a, __m512h b) { + // CHECK-LABEL: 
test_mm512_cmp_ph_mask_true_uq + // CHECK: fcmp true <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_TRUE_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_eq_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_eq_os + // CHECK: fcmp oeq <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_EQ_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_lt_oq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_lt_oq + // CHECK: fcmp olt <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_le_oq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_le_oq + // CHECK: fcmp ole <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_unord_s(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_unord_s + // CHECK: fcmp uno <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_UNORD_S); +} + +__mmask32 test_mm512_cmp_ph_mask_neq_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_neq_us + // CHECK: fcmp une <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NEQ_US); +} + +__mmask32 test_mm512_cmp_ph_mask_nlt_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_nlt_uq + // CHECK: fcmp uge <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NLT_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_nle_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_nle_uq + // CHECK: fcmp ugt <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NLE_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_ord_s(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_ord_s + // CHECK: fcmp ord <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_ORD_S); +} + +__mmask32 test_mm512_cmp_ph_mask_eq_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_eq_us + // CHECK: fcmp ueq <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_EQ_US); +} + +__mmask32 test_mm512_cmp_ph_mask_nge_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_nge_uq + // CHECK: fcmp ult <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_ngt_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_ngt_uq + // CHECK: fcmp ule <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NGT_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_false_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_false_os + // CHECK: fcmp false <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_FALSE_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_neq_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_neq_os + // CHECK: fcmp one <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NEQ_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_ge_oq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_ge_oq + // CHECK: fcmp oge <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_GE_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_gt_oq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_gt_oq + // CHECK: fcmp ogt <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_GT_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_true_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_true_us + // CHECK: fcmp true <32 x half> %{{.*}}, %{{.*}} + return 
_mm512_cmp_ph_mask(a, b, _CMP_TRUE_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_eq_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: @test_mm512_mask_cmp_ph_mask_eq_oq + // CHECK: [[CMP:%.*]] = fcmp oeq <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_lt_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_lt_os + // CHECK: [[CMP:%.*]] = fcmp olt <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_LT_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_le_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_le_os + // CHECK: [[CMP:%.*]] = fcmp ole <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_LE_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_unord_q(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_unord_q + // CHECK: [[CMP:%.*]] = fcmp uno <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_Q); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_neq_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_neq_uq + // CHECK: [[CMP:%.*]] = fcmp une <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_nlt_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nlt_us + // CHECK: [[CMP:%.*]] = fcmp uge <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NLT_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_nle_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nle_us + // CHECK: [[CMP:%.*]] = fcmp ugt <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NLE_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_ord_q(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ord_q + // CHECK: [[CMP:%.*]] = fcmp ord <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_ORD_Q); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_eq_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_eq_uq + // CHECK: [[CMP:%.*]] = fcmp ueq <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_EQ_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_nge_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nge_us + // CHECK: [[CMP:%.*]] = fcmp ult <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NGE_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_ngt_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ngt_us + // CHECK: [[CMP:%.*]] = fcmp ule <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NGT_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_false_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_false_oq + // CHECK: [[CMP:%.*]] = fcmp false <32 x 
half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_neq_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_neq_oq + // CHECK: [[CMP:%.*]] = fcmp one <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_ge_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ge_os + // CHECK: [[CMP:%.*]] = fcmp oge <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_GE_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_gt_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_gt_os + // CHECK: [[CMP:%.*]] = fcmp ogt <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_GT_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_true_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_true_uq + // CHECK: [[CMP:%.*]] = fcmp true <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_eq_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_eq_os + // CHECK: [[CMP:%.*]] = fcmp oeq <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_lt_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_lt_oq + // CHECK: [[CMP:%.*]] = fcmp olt <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_LT_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_le_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_le_oq + // CHECK: [[CMP:%.*]] = fcmp ole <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_LE_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_unord_s(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_unord_s + // CHECK: [[CMP:%.*]] = fcmp uno <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_S); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_neq_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_neq_us + // CHECK: [[CMP:%.*]] = fcmp une <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_nlt_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nlt_uq + // CHECK: [[CMP:%.*]] = fcmp uge <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NLT_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_nle_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nle_uq + // CHECK: [[CMP:%.*]] = fcmp ugt <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NLE_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_ord_s(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: 
test_mm512_mask_cmp_ph_mask_ord_s + // CHECK: [[CMP:%.*]] = fcmp ord <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_ORD_S); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_eq_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_eq_us + // CHECK: [[CMP:%.*]] = fcmp ueq <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_EQ_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_nge_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nge_uq + // CHECK: [[CMP:%.*]] = fcmp ult <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NGE_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_ngt_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ngt_uq + // CHECK: [[CMP:%.*]] = fcmp ule <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NGT_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_false_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_false_os + // CHECK: [[CMP:%.*]] = fcmp false <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_neq_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_neq_os + // CHECK: [[CMP:%.*]] = fcmp one <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_ge_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ge_oq + // CHECK: [[CMP:%.*]] = fcmp oge <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_GE_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_gt_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_gt_oq + // CHECK: [[CMP:%.*]] = fcmp ogt <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_GT_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_true_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_true_us + // CHECK: [[CMP:%.*]] = fcmp true <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_US); +} + +__mmask8 test_mm_cmp_round_sh_mask(__m128h __X, __m128h __Y) { + // CHECK-LABEL: @test_mm_cmp_round_sh_mask + // CHECK: @llvm.x86.avx512fp16.mask.cmp.sh + return _mm_cmp_round_sh_mask(__X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC); +} + +__mmask8 test_mm_mask_cmp_round_sh_mask(__mmask8 __M, __m128h __X, __m128h __Y) { + // CHECK-LABEL: @test_mm_mask_cmp_round_sh_mask + // CHECK: @llvm.x86.avx512fp16.mask.cmp.sh + return _mm_mask_cmp_round_sh_mask(__M, __X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC); +} + +__mmask8 test_mm_cmp_sh_mask(__m128h __X, __m128h __Y) { + // CHECK-LABEL: @test_mm_cmp_sh_mask + // CHECK: @llvm.x86.avx512fp16.mask.cmp.sh + return _mm_cmp_sh_mask(__X, __Y, _CMP_NLT_US); +} + +__mmask8 test_mm_mask_cmp_sh_mask(__mmask8 __M, __m128h __X, __m128h __Y) { + // CHECK-LABEL: @test_mm_mask_cmp_sh_mask + // CHECK: @llvm.x86.avx512fp16.mask.cmp.sh + return _mm_mask_cmp_sh_mask(__M, __X, __Y, _CMP_NLT_US); +} + 
// VMOVSH __m128h test_mm_load_sh(void const *A) { @@ -499,6 +1542,30 @@ return _mm_cvtsi16_si128(A); } +_Float16 test_mm512_reduce_add_ph(__m512h __W) { + // CHECK-LABEL: @test_mm512_reduce_add_ph + // CHECK: call reassoc half @llvm.vector.reduce.fadd.v32f16(half 0xH8000, <32 x half> %{{.*}}) + return _mm512_reduce_add_ph(__W); +} + +_Float16 test_mm512_reduce_mul_ph(__m512h __W) { + // CHECK-LABEL: @test_mm512_reduce_mul_ph + // CHECK: call reassoc half @llvm.vector.reduce.fmul.v32f16(half 0xH3C00, <32 x half> %{{.*}}) + return _mm512_reduce_mul_ph(__W); +} + +_Float16 test_mm512_reduce_max_ph(__m512h __W) { + // CHECK-LABEL: @test_mm512_reduce_max_ph + // CHECK: call nnan half @llvm.vector.reduce.fmax.v32f16(<32 x half> %{{.*}}) + return _mm512_reduce_max_ph(__W); +} + +_Float16 test_mm512_reduce_min_ph(__m512h __W) { + // CHECK-LABEL: @test_mm512_reduce_min_ph + // CHECK: call nnan half @llvm.vector.reduce.fmin.v32f16(<32 x half> %{{.*}}) + return _mm512_reduce_min_ph(__W); +} + __m512h test_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { // CHECK-LABEL: @test_mm512_mask_blend_ph // CHECK: %{{.*}} = bitcast i32 %{{.*}} to <32 x i1> diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c --- a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c @@ -139,6 +139,238 @@ __h9, __h10, __h11, __h12, __h13, __h14, __h15, __h16); } +__m256h test_mm256_add_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_add_ph + // CHECK: %{{.*}} = fadd <16 x half> %{{.*}}, %{{.*}} + return _mm256_add_ph(__A, __B); +} + +__m256h test_mm256_mask_add_ph(__m256h __W, __mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_add_ph + // CHECK: %{{.*}} = fadd <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return (__m256h)_mm256_mask_add_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_add_ph(__mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_add_ph + // CHECK: %{{.*}} = fadd <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_add_ph(__U, __A, __B); +} + +__m128h test_mm_add_ph(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_add_ph + // CHECK: %{{.*}} = fadd <8 x half> %{{.*}}, %{{.*}} + return _mm_add_ph(__A, __B); +} + +__m128h test_mm_mask_add_ph(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_add_ph + // CHECK: %{{.*}} = fadd <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return (__m128h)_mm_mask_add_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_add_ph(__mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_add_ph + // CHECK: %{{.*}} = fadd <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_add_ph(__U, __A, __B); +} + +__m256h test_mm256_sub_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_sub_ph + // CHECK: %{{.*}} = fsub <16 x half> %{{.*}}, %{{.*}} + return _mm256_sub_ph(__A, __B); +} + +__m256h test_mm256_mask_sub_ph(__m256h __W, __mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_sub_ph + // CHECK: %{{.*}} = fsub <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return 
(__m256h)_mm256_mask_sub_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_sub_ph(__mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_sub_ph + // CHECK: %{{.*}} = fsub <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_sub_ph(__U, __A, __B); +} + +__m128h test_mm_sub_ph(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_sub_ph + // CHECK: %{{.*}} = fsub <8 x half> %{{.*}}, %{{.*}} + return _mm_sub_ph(__A, __B); +} + +__m128h test_mm_mask_sub_ph(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_sub_ph + // CHECK: %{{.*}} = fsub <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return (__m128h)_mm_mask_sub_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_sub_ph(__mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_sub_ph + // CHECK: %{{.*}} = fsub <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_sub_ph(__U, __A, __B); +} + +__m256h test_mm256_mul_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mul_ph + // CHECK: %{{.*}} = fmul <16 x half> %{{.*}}, %{{.*}} + return _mm256_mul_ph(__A, __B); +} + +__m256h test_mm256_mask_mul_ph(__m256h __W, __mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_mul_ph + // CHECK: %{{.*}} = fmul <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return (__m256h)_mm256_mask_mul_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_mul_ph(__mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_mul_ph + // CHECK: %{{.*}} = fmul <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_mul_ph(__U, __A, __B); +} + +__m128h test_mm_mul_ph(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mul_ph + // CHECK: %{{.*}} = fmul <8 x half> %{{.*}}, %{{.*}} + return _mm_mul_ph(__A, __B); +} + +__m128h test_mm_mask_mul_ph(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_mul_ph + // CHECK: %{{.*}} = fmul <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return (__m128h)_mm_mask_mul_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_mul_ph(__mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_mul_ph + // CHECK: %{{.*}} = fmul <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_mul_ph(__U, __A, __B); +} + +__m256h test_mm256_div_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_div_ph + // CHECK: %{{.*}} = fdiv <16 x half> %{{.*}}, %{{.*}} + return _mm256_div_ph(__A, __B); +} + +__m256h test_mm256_mask_div_ph(__m256h __W, __mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_div_ph + // CHECK: %{{.*}} = fdiv <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return (__m256h)_mm256_mask_div_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_div_ph(__mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_div_ph + // CHECK: %{{.*}} = fdiv <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_div_ph(__U, __A, 
__B); +} + +__m128h test_mm_div_ph(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_div_ph + // CHECK: %{{.*}} = fdiv <8 x half> %{{.*}}, %{{.*}} + return _mm_div_ph(__A, __B); +} + +__m128h test_mm_mask_div_ph(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_div_ph + // CHECK: %{{.*}} = fdiv <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return (__m128h)_mm_mask_div_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_div_ph(__mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_div_ph + // CHECK: %{{.*}} = fdiv <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_div_ph(__U, __A, __B); +} + +__m256h test_mm256_min_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.256 + return _mm256_min_ph(__A, __B); +} + +__m256h test_mm256_mask_min_ph(__m256h __W, __mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.256 + return (__m256h)_mm256_mask_min_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_min_ph(__mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.256 + return _mm256_maskz_min_ph(__U, __A, __B); +} + +__m128h test_mm_min_ph(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.128 + return _mm_min_ph(__A, __B); +} + +__m128h test_mm_mask_min_ph(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.128 + return (__m128h)_mm_mask_min_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_min_ph(__mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.128 + return _mm_maskz_min_ph(__U, __A, __B); +} + +__m256h test_mm256_max_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.256 + return _mm256_max_ph(__A, __B); +} + +__m256h test_mm256_mask_max_ph(__m256h __W, __mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.256 + return (__m256h)_mm256_mask_max_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_max_ph(__mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.256 + return _mm256_maskz_max_ph(__U, __A, __B); +} + +__m128h test_mm_max_ph(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.128 + return _mm_max_ph(__A, __B); +} + +__m128h test_mm_mask_max_ph(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.128 + return (__m128h)_mm_mask_max_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_max_ph(__mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.128 + return _mm_maskz_max_ph(__U, __A, __B); +} + __m128h test_mm_abs_ph(__m128h a) { // CHECK-LABEL: @test_mm_abs_ph // CHECK: and <4 x i32> @@ -151,6 +383,838 @@ return _mm256_abs_ph(a); } +__mmask16 test_mm256_cmp_ph_mask_eq_oq(__m256h a, __m256h b) { + // CHECK-LABEL: @test_mm256_cmp_ph_mask_eq_oq + // CHECK: fcmp oeq <16 x half> %{{.*}}, %{{.*}} + return 
_mm256_cmp_ph_mask(a, b, _CMP_EQ_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_lt_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_lt_os + // CHECK: fcmp olt <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_LT_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_le_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_le_os + // CHECK: fcmp ole <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_LE_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_unord_q(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_unord_q + // CHECK: fcmp uno <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_UNORD_Q); +} + +__mmask16 test_mm256_cmp_ph_mask_neq_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_neq_uq + // CHECK: fcmp une <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NEQ_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_nlt_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_nlt_us + // CHECK: fcmp uge <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NLT_US); +} + +__mmask16 test_mm256_cmp_ph_mask_nle_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_nle_us + // CHECK: fcmp ugt <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NLE_US); +} + +__mmask16 test_mm256_cmp_ph_mask_ord_q(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_ord_q + // CHECK: fcmp ord <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_ORD_Q); +} + +__mmask16 test_mm256_cmp_ph_mask_eq_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_eq_uq + // CHECK: fcmp ueq <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_EQ_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_nge_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_nge_us + // CHECK: fcmp ult <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NGE_US); +} + +__mmask16 test_mm256_cmp_ph_mask_ngt_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_ngt_us + // CHECK: fcmp ule <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NGT_US); +} + +__mmask16 test_mm256_cmp_ph_mask_false_oq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_false_oq + // CHECK: fcmp false <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_FALSE_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_neq_oq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_neq_oq + // CHECK: fcmp one <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NEQ_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_ge_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_ge_os + // CHECK: fcmp oge <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_GE_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_gt_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_gt_os + // CHECK: fcmp ogt <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_GT_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_true_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_true_uq + // CHECK: fcmp true <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_TRUE_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_eq_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_eq_os + // CHECK: fcmp oeq <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_EQ_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_lt_oq(__m256h a, __m256h b) 
{ + // CHECK-LABEL: test_mm256_cmp_ph_mask_lt_oq + // CHECK: fcmp olt <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_LT_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_le_oq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_le_oq + // CHECK: fcmp ole <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_LE_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_unord_s(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_unord_s + // CHECK: fcmp uno <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_UNORD_S); +} + +__mmask16 test_mm256_cmp_ph_mask_neq_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_neq_us + // CHECK: fcmp une <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NEQ_US); +} + +__mmask16 test_mm256_cmp_ph_mask_nlt_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_nlt_uq + // CHECK: fcmp uge <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NLT_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_nle_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_nle_uq + // CHECK: fcmp ugt <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NLE_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_ord_s(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_ord_s + // CHECK: fcmp ord <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_ORD_S); +} + +__mmask16 test_mm256_cmp_ph_mask_eq_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_eq_us + // CHECK: fcmp ueq <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_EQ_US); +} + +__mmask16 test_mm256_cmp_ph_mask_nge_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_nge_uq + // CHECK: fcmp ult <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NGE_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_ngt_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_ngt_uq + // CHECK: fcmp ule <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NGT_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_false_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_false_os + // CHECK: fcmp false <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_FALSE_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_neq_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_neq_os + // CHECK: fcmp one <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NEQ_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_ge_oq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_ge_oq + // CHECK: fcmp oge <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_GE_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_gt_oq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_gt_oq + // CHECK: fcmp ogt <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_GT_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_true_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_true_us + // CHECK: fcmp true <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_TRUE_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_eq_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: @test_mm256_mask_cmp_ph_mask_eq_oq + // CHECK: [[CMP:%.*]] = fcmp oeq <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_lt_os(__mmask16 m, __m256h a, __m256h 
b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_lt_os + // CHECK: [[CMP:%.*]] = fcmp olt <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_LT_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_le_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_le_os + // CHECK: [[CMP:%.*]] = fcmp ole <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_LE_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_unord_q(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_unord_q + // CHECK: [[CMP:%.*]] = fcmp uno <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_Q); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_neq_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_neq_uq + // CHECK: [[CMP:%.*]] = fcmp une <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_nlt_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_nlt_us + // CHECK: [[CMP:%.*]] = fcmp uge <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NLT_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_nle_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_nle_us + // CHECK: [[CMP:%.*]] = fcmp ugt <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NLE_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_ord_q(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_ord_q + // CHECK: [[CMP:%.*]] = fcmp ord <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_ORD_Q); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_eq_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_eq_uq + // CHECK: [[CMP:%.*]] = fcmp ueq <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_EQ_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_nge_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_nge_us + // CHECK: [[CMP:%.*]] = fcmp ult <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NGE_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_ngt_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_ngt_us + // CHECK: [[CMP:%.*]] = fcmp ule <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NGT_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_false_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_false_oq + // CHECK: [[CMP:%.*]] = fcmp false <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_neq_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_neq_oq + // CHECK: [[CMP:%.*]] = fcmp one <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OQ); 
+} + +__mmask16 test_mm256_mask_cmp_ph_mask_ge_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_ge_os + // CHECK: [[CMP:%.*]] = fcmp oge <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_GE_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_gt_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_gt_os + // CHECK: [[CMP:%.*]] = fcmp ogt <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_GT_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_true_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_true_uq + // CHECK: [[CMP:%.*]] = fcmp true <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_eq_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_eq_os + // CHECK: [[CMP:%.*]] = fcmp oeq <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_lt_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_lt_oq + // CHECK: [[CMP:%.*]] = fcmp olt <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_LT_OQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_le_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_le_oq + // CHECK: [[CMP:%.*]] = fcmp ole <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_LE_OQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_unord_s(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_unord_s + // CHECK: [[CMP:%.*]] = fcmp uno <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_S); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_neq_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_neq_us + // CHECK: [[CMP:%.*]] = fcmp une <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_nlt_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_nlt_uq + // CHECK: [[CMP:%.*]] = fcmp uge <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NLT_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_nle_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_nle_uq + // CHECK: [[CMP:%.*]] = fcmp ugt <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NLE_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_ord_s(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_ord_s + // CHECK: [[CMP:%.*]] = fcmp ord <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_ORD_S); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_eq_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_eq_us + // CHECK: [[CMP:%.*]] = fcmp ueq <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> 
[[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_EQ_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_nge_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_nge_uq + // CHECK: [[CMP:%.*]] = fcmp ult <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NGE_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_ngt_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_ngt_uq + // CHECK: [[CMP:%.*]] = fcmp ule <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NGT_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_false_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_false_os + // CHECK: [[CMP:%.*]] = fcmp false <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_neq_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_neq_os + // CHECK: [[CMP:%.*]] = fcmp one <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_ge_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_ge_oq + // CHECK: [[CMP:%.*]] = fcmp oge <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_GE_OQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_gt_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_gt_oq + // CHECK: [[CMP:%.*]] = fcmp ogt <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_GT_OQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_true_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_true_us + // CHECK: [[CMP:%.*]] = fcmp true <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_US); +} + +__mmask8 test_mm_cmp_ph_mask_eq_oq(__m128h a, __m128h b) { + // CHECK-LABEL: @test_mm_cmp_ph_mask_eq_oq + // CHECK: fcmp oeq <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_EQ_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_lt_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_lt_os + // CHECK: fcmp olt <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_LT_OS); +} + +__mmask8 test_mm_cmp_ph_mask_le_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_le_os + // CHECK: fcmp ole <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_LE_OS); +} + +__mmask8 test_mm_cmp_ph_mask_unord_q(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_unord_q + // CHECK: fcmp uno <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_UNORD_Q); +} + +__mmask8 test_mm_cmp_ph_mask_neq_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_neq_uq + // CHECK: fcmp une <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NEQ_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_nlt_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_nlt_us + // CHECK: fcmp uge <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NLT_US); +} + +__mmask8 test_mm_cmp_ph_mask_nle_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_nle_us + // CHECK: fcmp ugt 
<8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NLE_US); +} + +__mmask8 test_mm_cmp_ph_mask_ord_q(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_ord_q + // CHECK: fcmp ord <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_ORD_Q); +} + +__mmask8 test_mm_cmp_ph_mask_eq_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_eq_uq + // CHECK: fcmp ueq <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_EQ_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_nge_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_nge_us + // CHECK: fcmp ult <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NGE_US); +} + +__mmask8 test_mm_cmp_ph_mask_ngt_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_ngt_us + // CHECK: fcmp ule <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NGT_US); +} + +__mmask8 test_mm_cmp_ph_mask_false_oq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_false_oq + // CHECK: fcmp false <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_FALSE_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_neq_oq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_neq_oq + // CHECK: fcmp one <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NEQ_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_ge_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_ge_os + // CHECK: fcmp oge <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_GE_OS); +} + +__mmask8 test_mm_cmp_ph_mask_gt_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_gt_os + // CHECK: fcmp ogt <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_GT_OS); +} + +__mmask8 test_mm_cmp_ph_mask_true_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_true_uq + // CHECK: fcmp true <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_TRUE_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_eq_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_eq_os + // CHECK: fcmp oeq <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_EQ_OS); +} + +__mmask8 test_mm_cmp_ph_mask_lt_oq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_lt_oq + // CHECK: fcmp olt <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_LT_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_le_oq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_le_oq + // CHECK: fcmp ole <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_LE_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_unord_s(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_unord_s + // CHECK: fcmp uno <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_UNORD_S); +} + +__mmask8 test_mm_cmp_ph_mask_neq_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_neq_us + // CHECK: fcmp une <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NEQ_US); +} + +__mmask8 test_mm_cmp_ph_mask_nlt_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_nlt_uq + // CHECK: fcmp uge <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NLT_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_nle_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_nle_uq + // CHECK: fcmp ugt <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NLE_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_ord_s(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_ord_s + // CHECK: fcmp ord <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_ORD_S); +} + 
+__mmask8 test_mm_cmp_ph_mask_eq_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_eq_us + // CHECK: fcmp ueq <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_EQ_US); +} + +__mmask8 test_mm_cmp_ph_mask_nge_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_nge_uq + // CHECK: fcmp ult <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NGE_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_ngt_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_ngt_uq + // CHECK: fcmp ule <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NGT_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_false_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_false_os + // CHECK: fcmp false <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_FALSE_OS); +} + +__mmask8 test_mm_cmp_ph_mask_neq_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_neq_os + // CHECK: fcmp one <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NEQ_OS); +} + +__mmask8 test_mm_cmp_ph_mask_ge_oq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_ge_oq + // CHECK: fcmp oge <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_GE_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_gt_oq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_gt_oq + // CHECK: fcmp ogt <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_GT_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_true_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_true_us + // CHECK: fcmp true <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_TRUE_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_eq_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: @test_mm_mask_cmp_ph_mask_eq_oq + // CHECK: [[CMP:%.*]] = fcmp oeq <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_lt_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_lt_os + // CHECK: [[CMP:%.*]] = fcmp olt <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_LT_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_le_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_le_os + // CHECK: [[CMP:%.*]] = fcmp ole <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_LE_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_unord_q(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_unord_q + // CHECK: [[CMP:%.*]] = fcmp uno <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_Q); +} + +__mmask8 test_mm_mask_cmp_ph_mask_neq_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_neq_uq + // CHECK: [[CMP:%.*]] = fcmp une <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_nlt_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_nlt_us + // CHECK: [[CMP:%.*]] = fcmp uge <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NLT_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_nle_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_nle_us + // CHECK: [[CMP:%.*]] = fcmp ugt <8 x half> 
%{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NLE_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_ord_q(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_ord_q + // CHECK: [[CMP:%.*]] = fcmp ord <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_ORD_Q); +} + +__mmask8 test_mm_mask_cmp_ph_mask_eq_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_eq_uq + // CHECK: [[CMP:%.*]] = fcmp ueq <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_EQ_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_nge_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_nge_us + // CHECK: [[CMP:%.*]] = fcmp ult <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NGE_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_ngt_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_ngt_us + // CHECK: [[CMP:%.*]] = fcmp ule <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NGT_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_false_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_false_oq + // CHECK: [[CMP:%.*]] = fcmp false <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_neq_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_neq_oq + // CHECK: [[CMP:%.*]] = fcmp one <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_ge_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_ge_os + // CHECK: [[CMP:%.*]] = fcmp oge <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_GE_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_gt_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_gt_os + // CHECK: [[CMP:%.*]] = fcmp ogt <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_GT_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_true_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_true_uq + // CHECK: [[CMP:%.*]] = fcmp true <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_eq_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_eq_os + // CHECK: [[CMP:%.*]] = fcmp oeq <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_lt_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_lt_oq + // CHECK: [[CMP:%.*]] = fcmp olt <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_LT_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_le_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_le_oq + // CHECK: [[CMP:%.*]] = fcmp ole <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return 
_mm_mask_cmp_ph_mask(m, a, b, _CMP_LE_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_unord_s(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_unord_s + // CHECK: [[CMP:%.*]] = fcmp uno <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_S); +} + +__mmask8 test_mm_mask_cmp_ph_mask_neq_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_neq_us + // CHECK: [[CMP:%.*]] = fcmp une <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_nlt_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_nlt_uq + // CHECK: [[CMP:%.*]] = fcmp uge <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NLT_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_nle_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_nle_uq + // CHECK: [[CMP:%.*]] = fcmp ugt <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NLE_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_ord_s(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_ord_s + // CHECK: [[CMP:%.*]] = fcmp ord <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_ORD_S); +} + +__mmask8 test_mm_mask_cmp_ph_mask_eq_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_eq_us + // CHECK: [[CMP:%.*]] = fcmp ueq <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_EQ_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_nge_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_nge_uq + // CHECK: [[CMP:%.*]] = fcmp ult <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NGE_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_ngt_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_ngt_uq + // CHECK: [[CMP:%.*]] = fcmp ule <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NGT_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_false_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_false_os + // CHECK: [[CMP:%.*]] = fcmp false <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_neq_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_neq_os + // CHECK: [[CMP:%.*]] = fcmp one <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_ge_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_ge_oq + // CHECK: [[CMP:%.*]] = fcmp oge <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_GE_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_gt_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_gt_oq + // CHECK: [[CMP:%.*]] = fcmp ogt <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_GT_OQ); +} + +__mmask8 
test_mm_mask_cmp_ph_mask_true_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_true_us + // CHECK: [[CMP:%.*]] = fcmp true <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_US); +} + __m128h test_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { // CHECK-LABEL: @test_mm_mask_blend_ph // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> @@ -202,3 +1266,51 @@ // CHECK: %{{.*}} = bitcast <16 x i16> %{{.*}} to <16 x half> return _mm256_permutexvar_ph(__A, __B); } + +_Float16 test_mm256_reduce_add_ph(__m256h __W) { + // CHECK-LABEL: @test_mm256_reduce_add_ph + // CHECK: call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH8000, <16 x half> %{{.*}}) + return _mm256_reduce_add_ph(__W); +} + +_Float16 test_mm256_reduce_mul_ph(__m256h __W) { + // CHECK-LABEL: @test_mm256_reduce_mul_ph + // CHECK: call reassoc half @llvm.vector.reduce.fmul.v16f16(half 0xH3C00, <16 x half> %{{.*}}) + return _mm256_reduce_mul_ph(__W); +} + +_Float16 test_mm256_reduce_max_ph(__m256h __W) { + // CHECK-LABEL: @test_mm256_reduce_max_ph + // CHECK: call nnan half @llvm.vector.reduce.fmax.v16f16(<16 x half> %{{.*}}) + return _mm256_reduce_max_ph(__W); +} + +_Float16 test_mm256_reduce_min_ph(__m256h __W) { + // CHECK-LABEL: @test_mm256_reduce_min_ph + // CHECK: call nnan half @llvm.vector.reduce.fmin.v16f16(<16 x half> %{{.*}}) + return _mm256_reduce_min_ph(__W); +} + +_Float16 test_mm_reduce_add_ph(__m128h __W) { + // CHECK-LABEL: @test_mm_reduce_add_ph + // CHECK: call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> %{{.*}}) + return _mm_reduce_add_ph(__W); +} + +_Float16 test_mm_reduce_mul_ph(__m128h __W) { + // CHECK-LABEL: @test_mm_reduce_mul_ph + // CHECK: call reassoc half @llvm.vector.reduce.fmul.v8f16(half 0xH3C00, <8 x half> %{{.*}}) + return _mm_reduce_mul_ph(__W); +} + +_Float16 test_mm_reduce_min_ph(__m128h __W) { + // CHECK-LABEL: @test_mm_reduce_min_ph + // CHECK: call nnan half @llvm.vector.reduce.fmin.v8f16(<8 x half> %{{.*}}) + return _mm_reduce_min_ph(__W); +} + +_Float16 test_mm_reduce_max_ph(__m128h __W) { + // CHECK-LABEL: @test_mm_reduce_max_ph + // CHECK: call nnan half @llvm.vector.reduce.fmax.v8f16(<8 x half> %{{.*}}) + return _mm_reduce_max_ph(__W); +} diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5108,3 +5108,116 @@ def int_x86_senduipi : GCCBuiltin<"__builtin_ia32_senduipi">, Intrinsic<[], [llvm_i64_ty], []>; } + +//===----------------------------------------------------------------------===// +// avx512_fp16: vaddph +let TargetPrefix = "x86" in { + def int_x86_avx512fp16_add_ph_512 + : GCCBuiltin<"__builtin_ia32_addph512">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_sub_ph_512 + : GCCBuiltin<"__builtin_ia32_subph512">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mul_ph_512 + : GCCBuiltin<"__builtin_ia32_mulph512">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_div_ph_512 + : GCCBuiltin<"__builtin_ia32_divph512">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_max_ph_128 + : 
GCCBuiltin<"__builtin_ia32_maxph128">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_max_ph_256 + : GCCBuiltin<"__builtin_ia32_maxph256">, + Intrinsic<[ llvm_v16f16_ty ], + [ llvm_v16f16_ty, llvm_v16f16_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_max_ph_512 + : GCCBuiltin<"__builtin_ia32_maxph512">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_min_ph_128 + : GCCBuiltin<"__builtin_ia32_minph128">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_min_ph_256 + : GCCBuiltin<"__builtin_ia32_minph256">, + Intrinsic<[ llvm_v16f16_ty ], + [ llvm_v16f16_ty, llvm_v16f16_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_min_ph_512 + : GCCBuiltin<"__builtin_ia32_minph512">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + + def int_x86_avx512fp16_mask_cmp_ph_512 + : Intrinsic<[ llvm_v32i1_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_v32i1_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg>, ImmArg> ]>; + def int_x86_avx512fp16_mask_cmp_ph_256 + : Intrinsic<[ llvm_v16i1_ty ], + [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i32_ty, llvm_v16i1_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_cmp_ph_128 + : Intrinsic<[ llvm_v8i1_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_v8i1_ty ], + [ IntrNoMem, ImmArg> ]>; + + def int_x86_avx512fp16_mask_add_sh_round + : GCCBuiltin<"__builtin_ia32_addsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_sub_sh_round + : GCCBuiltin<"__builtin_ia32_subsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_mul_sh_round + : GCCBuiltin<"__builtin_ia32_mulsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_div_sh_round + : GCCBuiltin<"__builtin_ia32_divsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_min_sh_round + : GCCBuiltin<"__builtin_ia32_minsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_max_sh_round + : GCCBuiltin<"__builtin_ia32_maxsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_cmp_sh + : GCCBuiltin<"__builtin_ia32_cmpsh_mask">, + Intrinsic<[ llvm_i8_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg>, ImmArg> ]>; + def int_x86_avx512fp16_vcomi_sh + : GCCBuiltin<"__builtin_ia32_vcomish">, + Intrinsic<[ llvm_i32_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg>, ImmArg> ]>; +} diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3127,9 
+3127,10 @@ unsigned ComparisonPredicate = ~0U; - // FIXME: Hack to recognize cmp{ss,sd,ps,pd}. + // FIXME: Hack to recognize cmp{sh,ss,sd,ph,ps,pd}. if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && (PatchedName.endswith("ss") || PatchedName.endswith("sd") || + PatchedName.endswith("sh") || PatchedName.endswith("ph") || PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { bool IsVCMP = PatchedName[0] == 'v'; unsigned CCIdx = IsVCMP ? 4 : 3; @@ -3192,6 +3193,10 @@ PatchedName = IsVCMP ? "vcmpps" : "cmpps"; else if (PatchedName.endswith("pd")) PatchedName = IsVCMP ? "vcmppd" : "cmppd"; + else if (PatchedName.endswith("sh")) + PatchedName = "vcmpsh"; + else if (PatchedName.endswith("ph")) + PatchedName = "vcmpph"; else llvm_unreachable("Unexpected suffix!"); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -153,6 +153,20 @@ case X86::VCMPPSZrrib: case X86::VCMPPSZrribk: case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk: case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk: + case X86::VCMPPHZ128rmi: case X86::VCMPPHZ128rri: + case X86::VCMPPHZ256rmi: case X86::VCMPPHZ256rri: + case X86::VCMPPHZrmi: case X86::VCMPPHZrri: + case X86::VCMPSHZrm: case X86::VCMPSHZrr: + case X86::VCMPSHZrm_Int: case X86::VCMPSHZrr_Int: + case X86::VCMPPHZ128rmik: case X86::VCMPPHZ128rrik: + case X86::VCMPPHZ256rmik: case X86::VCMPPHZ256rrik: + case X86::VCMPPHZrmik: case X86::VCMPPHZrrik: + case X86::VCMPSHZrm_Intk: case X86::VCMPSHZrr_Intk: + case X86::VCMPPHZ128rmbi: case X86::VCMPPHZ128rmbik: + case X86::VCMPPHZ256rmbi: case X86::VCMPPHZ256rmbik: + case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik: + case X86::VCMPPHZrrib: case X86::VCMPPHZrribk: + case X86::VCMPSHZrrb_Int: case X86::VCMPSHZrrb_Intk: if (Imm >= 0 && Imm <= 31) { OS << '\t'; printCMPMnemonic(MI, /*IsVCMP*/true, OS); @@ -176,6 +190,8 @@ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8; else NumElts = (Desc.TSFlags & X86II::VEX_W) ? 
2 : 4; + if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) + NumElts *= 2; OS << "{1to" << NumElts << "}"; } else { if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -264,6 +264,24 @@ case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk: OS << "ss\t"; break; + case X86::VCMPPHZ128rmi: case X86::VCMPPHZ128rri: + case X86::VCMPPHZ256rmi: case X86::VCMPPHZ256rri: + case X86::VCMPPHZrmi: case X86::VCMPPHZrri: + case X86::VCMPPHZ128rmik: case X86::VCMPPHZ128rrik: + case X86::VCMPPHZ256rmik: case X86::VCMPPHZ256rrik: + case X86::VCMPPHZrmik: case X86::VCMPPHZrrik: + case X86::VCMPPHZ128rmbi: case X86::VCMPPHZ128rmbik: + case X86::VCMPPHZ256rmbi: case X86::VCMPPHZ256rmbik: + case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik: + case X86::VCMPPHZrrib: case X86::VCMPPHZrribk: + OS << "ph\t"; + break; + case X86::VCMPSHZrm: case X86::VCMPSHZrr: + case X86::VCMPSHZrm_Int: case X86::VCMPSHZrr_Int: + case X86::VCMPSHZrrb_Int: case X86::VCMPSHZrrb_Intk: + case X86::VCMPSHZrm_Intk: case X86::VCMPSHZrr_Intk: + OS << "sh\t"; + break; } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1900,6 +1900,15 @@ if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) { auto setGroup = [&] (MVT VT) { + setOperationAction(ISD::FADD, VT, Legal); + setOperationAction(ISD::STRICT_FADD, VT, Legal); + setOperationAction(ISD::FSUB, VT, Legal); + setOperationAction(ISD::STRICT_FSUB, VT, Legal); + setOperationAction(ISD::FMUL, VT, Legal); + setOperationAction(ISD::STRICT_FMUL, VT, Legal); + setOperationAction(ISD::FDIV, VT, Legal); + setOperationAction(ISD::STRICT_FDIV, VT, Legal); + setOperationAction(ISD::LOAD, VT, Legal); setOperationAction(ISD::STORE, VT, Legal); @@ -1917,6 +1926,14 @@ // AVX512_FP16 scalar operations setGroup(MVT::f16); addRegisterClass(MVT::f16, &X86::FR16XRegClass); + setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); + setOperationAction(ISD::BR_CC, MVT::f16, Expand); + setOperationAction(ISD::SETCC, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); + + setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f16, Expand); if (Subtarget.useAVX512Regs()) { setGroup(MVT::v32f16); @@ -1930,6 +1947,9 @@ setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal); + + setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom); } if (Subtarget.hasVLX()) { @@ -47951,6 +47971,7 @@ EVT VT = N->getValueType(0); if (!((Subtarget.hasSSE1() && VT == MVT::f32) || (Subtarget.hasSSE2() && VT == MVT::f64) || + (Subtarget.hasFP16() && VT == MVT::f16) || (VT.isVector() && TLI.isTypeLegal(VT)))) return SDValue(); @@ -48512,6 +48533,9 @@ SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64) return SDValue(); + // We don't have CMPP Instruction for vxf16 + if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16) + return SDValue(); // We can only do this if the vector size in 256 bits or less. 
unsigned Size = VT.getSizeInBits(); if (Size > 256 && Subtarget.useAVX512Regs()) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2169,6 +2169,10 @@ X86cmpms_su, X86cmpmsSAE_su, SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W; } +let Predicates = [HasFP16], ExeDomain = SSEPackedSingle in + defm VCMPSHZ : avx512_cmp_scalar, AVX512XSIi8Base, TA; multiclass avx512_icmp_packed opc, string OpcodeStr, X86FoldableSchedWrite sched, @@ -2631,13 +2635,14 @@ EVEX_B, Sched<[sched]>; } -multiclass avx512_vcmp { - let Predicates = [HasAVX512] in { +multiclass avx512_vcmp { + let Predicates = [Pred] in { defm Z : avx512_vcmp_common, avx512_vcmp_sae, EVEX_V512; } - let Predicates = [HasAVX512,HasVLX] in { + let Predicates = [Pred,HasVLX] in { defm Z128 : avx512_vcmp_common, EVEX_V128; defm Z256 : avx512_vcmp_common, EVEX_V256; } @@ -2659,6 +2664,13 @@ (VCMPSSZrm FR32X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; } +defm VCMPPH : avx512_vcmp, + AVX512PSIi8Base, EVEX_4V, EVEX_CD8<16, CD8VF>, TA; +let Predicates = [HasFP16] in { + def : Pat<(v1i1 (X86cmpms(loadf16 addr:$src2), FR16X:$src1, CommutableCMPCC:$cc)), + (VCMPSHZrm FR16X:$src1, addr:$src2, imm:$cc)>; +} + // ---------------------------------------------------------------- // FPClass @@ -4152,7 +4164,7 @@ VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info, - [HasFP16, OptForSize]>, + [HasFP16]>, VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>; multiclass avx512_move_scalar_lowering, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; + let Predicates = [HasFP16] in + defm SHZ : avx512_fp_scalar, + avx512_fp_scalar_round, + T_MAP5XS, EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>; } multiclass avx512_binop_s_sae opc, string OpcodeStr, SDNode OpNode, @@ -5647,6 +5665,13 @@ VecNode, SaeNode, sched.PD.Scl, IsCommutable, NAME#"SD">, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; + let Predicates = [HasFP16] in { + defm SHZ : avx512_fp_scalar_sae, + T_MAP5XS, EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, + NotEVEX2VEXConvertible; + } } defm VADD : avx512_binop_s_round<0x58, "vadd", any_fadd, X86fadds, X86faddRnds, SchedWriteFAddSizes, 1>; @@ -5702,6 +5727,15 @@ VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, SIMD_EXC; +defm VMINCSHZ : avx512_comutable_binop_s<0x5D, "vminsh", f16x_info, X86fminc, + SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5XS, + EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, + NotEVEX2VEXConvertible; +defm VMAXCSHZ : avx512_comutable_binop_s<0x5F, "vmaxsh", f16x_info, X86fmaxc, + SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5XS, + EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, + NotEVEX2VEXConvertible; + multiclass avx512_fp_packed opc, string OpcodeStr, SDPatternOperator OpNode, SDPatternOperator MaskOpNode, X86VectorVTInfo _, X86FoldableSchedWrite sched, @@ -5789,9 +5823,33 @@ } } +multiclass avx512_fp_binop_ph opc, string OpcodeStr, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, + X86SchedWriteSizes sched, bit IsCommutable = 0, + bit IsPD128Commutable = IsCommutable> { + let Predicates = [HasFP16] in { + defm PHZ : avx512_fp_packed, EVEX_V512, T_MAP5PS, + EVEX_CD8<16, CD8VF>; + } + let Predicates = [HasVLX, HasFP16] in { + defm PHZ128 : avx512_fp_packed, EVEX_V128, T_MAP5PS, + EVEX_CD8<16, CD8VF>; + defm PHZ256 : avx512_fp_packed, EVEX_V256, T_MAP5PS, + EVEX_CD8<16, CD8VF>; + } +} + let Uses = [MXCSR] in multiclass avx512_fp_binop_p_round opc, 
string OpcodeStr, SDNode OpNodeRnd, X86SchedWriteSizes sched> { + let Predicates = [HasFP16] in { + defm PHZ : avx512_fp_round_packed, + EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>; + } defm PSZ : avx512_fp_round_packed, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; @@ -5803,6 +5861,11 @@ let Uses = [MXCSR] in multiclass avx512_fp_binop_p_sae opc, string OpcodeStr, SDNode OpNodeRnd, X86SchedWriteSizes sched> { + let Predicates = [HasFP16] in { + defm PHZ : avx512_fp_sae_packed, + EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>; + } defm PSZ : avx512_fp_sae_packed, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; @@ -5813,26 +5876,36 @@ defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, fadd, HasAVX512, SchedWriteFAddSizes, 1>, + avx512_fp_binop_ph<0x58, "vadd", any_fadd, fadd, SchedWriteFAddSizes, 1>, avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SchedWriteFAddSizes>; defm VMUL : avx512_fp_binop_p<0x59, "vmul", any_fmul, fmul, HasAVX512, SchedWriteFMulSizes, 1>, + avx512_fp_binop_ph<0x59, "vmul", any_fmul, fmul, SchedWriteFMulSizes, 1>, avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SchedWriteFMulSizes>; defm VSUB : avx512_fp_binop_p<0x5C, "vsub", any_fsub, fsub, HasAVX512, SchedWriteFAddSizes>, + avx512_fp_binop_ph<0x5C, "vsub", any_fsub, fsub, SchedWriteFAddSizes>, avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SchedWriteFAddSizes>; defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", any_fdiv, fdiv, HasAVX512, SchedWriteFDivSizes>, + avx512_fp_binop_ph<0x5E, "vdiv", any_fdiv, fdiv, SchedWriteFDivSizes>, avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>; defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, X86fmin, HasAVX512, SchedWriteFCmpSizes, 0>, + avx512_fp_binop_ph<0x5D, "vmin", X86fmin, X86fmin, SchedWriteFCmpSizes, 0>, avx512_fp_binop_p_sae<0x5D, "vmin", X86fminSAE, SchedWriteFCmpSizes>; defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, X86fmax, HasAVX512, SchedWriteFCmpSizes, 0>, + avx512_fp_binop_ph<0x5F, "vmax", X86fmax, X86fmax, SchedWriteFCmpSizes, 0>, avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxSAE, SchedWriteFCmpSizes>; let isCodeGenOnly = 1 in { defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, X86fminc, HasAVX512, + SchedWriteFCmpSizes, 1>, + avx512_fp_binop_ph<0x5D, "vmin", X86fminc, X86fminc, SchedWriteFCmpSizes, 1>; defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, X86fmaxc, HasAVX512, + SchedWriteFCmpSizes, 1>, + avx512_fp_binop_ph<0x5F, "vmax", X86fmaxc, X86fmaxc, SchedWriteFCmpSizes, 1>; } let Uses = [], mayRaiseFPException = 0 in { @@ -8945,6 +9018,30 @@ } } +let Defs = [EFLAGS], Predicates = [HasFP16] in { + defm VUCOMISHZ : avx512_ord_cmp_sae<0x2E, v8f16x_info, "vucomish", + SSEPackedSingle>, AVX512PSIi8Base, T_MAP5PS, + EVEX_CD8<16, CD8VT1>; + defm VCOMISHZ : avx512_ord_cmp_sae<0x2F, v8f16x_info, "vcomish", + SSEPackedSingle>, AVX512PSIi8Base, T_MAP5PS, + EVEX_CD8<16, CD8VT1>; + defm VUCOMISHZ : sse12_ord_cmp<0x2E, FR16X, X86any_fcmp, f16, f16mem, loadf16, + "ucomish", SSEPackedSingle>, T_MAP5PS, EVEX, + VEX_LIG, EVEX_CD8<16, CD8VT1>; + defm VCOMISHZ : sse12_ord_cmp<0x2F, FR16X, X86strict_fcmps, f16, f16mem, loadf16, + "comish", SSEPackedSingle>, T_MAP5PS, EVEX, + VEX_LIG, EVEX_CD8<16, CD8VT1>; + let isCodeGenOnly = 1 in { + defm VUCOMISHZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v8f16, shmem, + sse_load_f16, "ucomish", SSEPackedSingle>, + T_MAP5PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; + + defm VCOMISHZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v8f16, shmem, + sse_load_f16, "comish", SSEPackedSingle>, + T_MAP5PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; + } 
+} + /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd multiclass avx512_fp14_s opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { @@ -11868,6 +11965,11 @@ defm : AVX512_scalar_math_fp_patterns; defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; + multiclass AVX512_scalar_unary_math_patterns { let Predicates = [HasAVX512] in { diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -1550,6 +1550,9 @@ { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, { X86::VADDPDrr, X86::VADDPDrm, 0 }, + { X86::VADDPHZ128rr, X86::VADDPHZ128rm, 0 }, + { X86::VADDPHZ256rr, X86::VADDPHZ256rm, 0 }, + { X86::VADDPHZrr, X86::VADDPHZrm, 0 }, { X86::VADDPSYrr, X86::VADDPSYrm, 0 }, { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 }, { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 }, @@ -1559,6 +1562,8 @@ { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE }, { X86::VADDSDrr, X86::VADDSDrm, 0 }, { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE }, + { X86::VADDSHZrr, X86::VADDSHZrm, 0 }, + { X86::VADDSHZrr_Int, X86::VADDSHZrm_Int, TB_NO_REVERSE }, { X86::VADDSSZrr, X86::VADDSSZrm, 0 }, { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE }, { X86::VADDSSrr, X86::VADDSSrm, 0 }, @@ -1642,6 +1647,9 @@ { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 }, { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 }, { X86::VCMPPDrri, X86::VCMPPDrmi, 0 }, + { X86::VCMPPHZ128rri, X86::VCMPPHZ128rmi, 0 }, + { X86::VCMPPHZ256rri, X86::VCMPPHZ256rmi, 0 }, + { X86::VCMPPHZrri, X86::VCMPPHZrmi, 0 }, { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 }, { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 }, @@ -1651,6 +1659,8 @@ { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE }, { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, { X86::VCMPSDrr_Int, X86::VCMPSDrm_Int, TB_NO_REVERSE }, + { X86::VCMPSHZrr, X86::VCMPSHZrm, 0 }, + { X86::VCMPSHZrr_Int, X86::VCMPSHZrm_Int, TB_NO_REVERSE }, { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 }, { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE }, { X86::VCMPSSrr, X86::VCMPSSrm, 0 }, @@ -1782,6 +1792,9 @@ { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 }, { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, + { X86::VDIVPHZ128rr, X86::VDIVPHZ128rm, 0 }, + { X86::VDIVPHZ256rr, X86::VDIVPHZ256rm, 0 }, + { X86::VDIVPHZrr, X86::VDIVPHZrm, 0 }, { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 }, { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 }, @@ -1791,6 +1804,8 @@ { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE }, { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE }, + { X86::VDIVSHZrr, X86::VDIVSHZrm, 0 }, + { X86::VDIVSHZrr_Int, X86::VDIVSHZrm_Int, TB_NO_REVERSE }, { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 }, { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE }, { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, @@ -1912,6 +1927,9 @@ { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 }, { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 }, { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 }, + { X86::VMAXCPHZ128rr, X86::VMAXCPHZ128rm, 0 }, + { X86::VMAXCPHZ256rr, X86::VMAXCPHZ256rm, 0 }, + { X86::VMAXCPHZrr, X86::VMAXCPHZrm, 0 }, { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 }, { X86::VMAXCPSZ128rr, 
X86::VMAXCPSZ128rm, 0 }, { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 }, @@ -1919,6 +1937,7 @@ { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 }, { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 }, { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 }, + { X86::VMAXCSHZrr, X86::VMAXCSHZrm, 0 }, { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 }, { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 }, { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 }, @@ -1926,6 +1945,9 @@ { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 }, { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 }, { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, + { X86::VMAXPHZ128rr, X86::VMAXPHZ128rm, 0 }, + { X86::VMAXPHZ256rr, X86::VMAXPHZ256rm, 0 }, + { X86::VMAXPHZrr, X86::VMAXPHZrm, 0 }, { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 }, { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 }, { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 }, @@ -1935,6 +1957,8 @@ { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE }, { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE }, + { X86::VMAXSHZrr, X86::VMAXSHZrm, 0 }, + { X86::VMAXSHZrr_Int, X86::VMAXSHZrm_Int, TB_NO_REVERSE }, { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 }, { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE }, { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, @@ -1944,6 +1968,9 @@ { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 }, { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 }, { X86::VMINCPDrr, X86::VMINCPDrm, 0 }, + { X86::VMINCPHZ128rr, X86::VMINCPHZ128rm, 0 }, + { X86::VMINCPHZ256rr, X86::VMINCPHZ256rm, 0 }, + { X86::VMINCPHZrr, X86::VMINCPHZrm, 0 }, { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 }, { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 }, { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 }, @@ -1951,6 +1978,7 @@ { X86::VMINCPSrr, X86::VMINCPSrm, 0 }, { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 }, { X86::VMINCSDrr, X86::VMINCSDrm, 0 }, + { X86::VMINCSHZrr, X86::VMINCSHZrm, 0 }, { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 }, { X86::VMINCSSrr, X86::VMINCSSrm, 0 }, { X86::VMINPDYrr, X86::VMINPDYrm, 0 }, @@ -1958,6 +1986,9 @@ { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 }, { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, { X86::VMINPDrr, X86::VMINPDrm, 0 }, + { X86::VMINPHZ128rr, X86::VMINPHZ128rm, 0 }, + { X86::VMINPHZ256rr, X86::VMINPHZ256rm, 0 }, + { X86::VMINPHZrr, X86::VMINPHZrm, 0 }, { X86::VMINPSYrr, X86::VMINPSYrm, 0 }, { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 }, { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 }, @@ -1967,6 +1998,8 @@ { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE }, { X86::VMINSDrr, X86::VMINSDrm, 0 }, { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE }, + { X86::VMINSHZrr, X86::VMINSHZrm, 0 }, + { X86::VMINSHZrr_Int, X86::VMINSHZrm_Int, TB_NO_REVERSE }, { X86::VMINSSZrr, X86::VMINSSZrm, 0 }, { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE }, { X86::VMINSSrr, X86::VMINSSrm, 0 }, @@ -2021,6 +2054,9 @@ { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 }, { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, { X86::VMULPDrr, X86::VMULPDrm, 0 }, + { X86::VMULPHZ128rr, X86::VMULPHZ128rm, 0 }, + { X86::VMULPHZ256rr, X86::VMULPHZ256rm, 0 }, + { X86::VMULPHZrr, X86::VMULPHZrm, 0 }, { X86::VMULPSYrr, X86::VMULPSYrm, 0 }, { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 }, { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 }, @@ -2030,6 +2066,8 @@ { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE }, { X86::VMULSDrr, X86::VMULSDrm, 0 }, { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE }, + { X86::VMULSHZrr, X86::VMULSHZrm, 0 }, + { X86::VMULSHZrr_Int, X86::VMULSHZrm_Int, TB_NO_REVERSE }, { X86::VMULSSZrr, X86::VMULSSZrm, 0 }, { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE }, { X86::VMULSSrr, X86::VMULSSrm, 
0 }, @@ -2944,6 +2982,9 @@ { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 }, { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, + { X86::VSUBPHZ128rr, X86::VSUBPHZ128rm, 0 }, + { X86::VSUBPHZ256rr, X86::VSUBPHZ256rm, 0 }, + { X86::VSUBPHZrr, X86::VSUBPHZrm, 0 }, { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 }, { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 }, { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 }, @@ -2953,6 +2994,8 @@ { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE }, { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE }, + { X86::VSUBSHZrr, X86::VSUBSHZrm, 0 }, + { X86::VSUBSHZrr_Int, X86::VSUBSHZrm_Int, TB_NO_REVERSE }, { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 }, { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE }, { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, @@ -2999,10 +3042,14 @@ { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 }, { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 }, { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, + { X86::VADDPHZ128rrkz, X86::VADDPHZ128rmkz, 0 }, + { X86::VADDPHZ256rrkz, X86::VADDPHZ256rmkz, 0 }, + { X86::VADDPHZrrkz, X86::VADDPHZrmkz, 0 }, { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VADDSHZrr_Intkz, X86::VADDSHZrm_Intkz, TB_NO_REVERSE }, { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE }, { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 }, { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 }, @@ -3041,10 +3088,14 @@ { X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 }, { X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 }, { X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 }, + { X86::VCMPPHZ128rrik, X86::VCMPPHZ128rmik, 0 }, + { X86::VCMPPHZ256rrik, X86::VCMPPHZ256rmik, 0 }, + { X86::VCMPPHZrrik, X86::VCMPPHZrmik, 0 }, { X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0 }, { X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0 }, { X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0 }, { X86::VCMPSDZrr_Intk, X86::VCMPSDZrm_Intk, TB_NO_REVERSE }, + { X86::VCMPSHZrr_Intk, X86::VCMPSHZrm_Intk, TB_NO_REVERSE }, { X86::VCMPSSZrr_Intk, X86::VCMPSSZrm_Intk, TB_NO_REVERSE }, { X86::VCVTDQ2PDZ128rrk, X86::VCVTDQ2PDZ128rmk, TB_NO_REVERSE }, { X86::VCVTDQ2PDZ256rrk, X86::VCVTDQ2PDZ256rmk, 0 }, @@ -3141,10 +3192,14 @@ { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 }, { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 }, { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, + { X86::VDIVPHZ128rrkz, X86::VDIVPHZ128rmkz, 0 }, + { X86::VDIVPHZ256rrkz, X86::VDIVPHZ256rmkz, 0 }, + { X86::VDIVPHZrrkz, X86::VDIVPHZrmkz, 0 }, { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VDIVSHZrr_Intkz, X86::VDIVSHZrm_Intkz, TB_NO_REVERSE }, { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE }, { X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0 }, { X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0 }, @@ -3521,30 +3576,44 @@ { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 }, { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 }, { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 }, + { X86::VMAXCPHZ128rrkz, X86::VMAXCPHZ128rmkz, 0 }, + { X86::VMAXCPHZ256rrkz, X86::VMAXCPHZ256rmkz, 0 }, + { X86::VMAXCPHZrrkz, X86::VMAXCPHZrmkz, 0 }, { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 }, { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 }, { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 }, { 
X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }, { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 }, { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, + { X86::VMAXPHZ128rrkz, X86::VMAXPHZ128rmkz, 0 }, + { X86::VMAXPHZ256rrkz, X86::VMAXPHZ256rmkz, 0 }, + { X86::VMAXPHZrrkz, X86::VMAXPHZrmkz, 0 }, { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 }, { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 }, { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VMAXSHZrr_Intkz, X86::VMAXSHZrm_Intkz, TB_NO_REVERSE }, { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, TB_NO_REVERSE }, { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 }, { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 }, { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 }, + { X86::VMINCPHZ128rrkz, X86::VMINCPHZ128rmkz, 0 }, + { X86::VMINCPHZ256rrkz, X86::VMINCPHZ256rmkz, 0 }, + { X86::VMINCPHZrrkz, X86::VMINCPHZrmkz, 0 }, { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 }, { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 }, { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 }, { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, + { X86::VMINPHZ128rrkz, X86::VMINPHZ128rmkz, 0 }, + { X86::VMINPHZ256rrkz, X86::VMINPHZ256rmkz, 0 }, + { X86::VMINPHZrrkz, X86::VMINPHZrmkz, 0 }, { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VMINSHZrr_Intkz, X86::VMINSHZrm_Intkz, TB_NO_REVERSE }, { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, TB_NO_REVERSE }, { X86::VMOVAPDZ128rrk, X86::VMOVAPDZ128rmk, TB_NO_REVERSE | TB_ALIGN_16 }, { X86::VMOVAPDZ256rrk, X86::VMOVAPDZ256rmk, TB_NO_REVERSE | TB_ALIGN_32 }, @@ -3588,10 +3657,14 @@ { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, + { X86::VMULPHZ128rrkz, X86::VMULPHZ128rmkz, 0 }, + { X86::VMULPHZ256rrkz, X86::VMULPHZ256rmkz, 0 }, + { X86::VMULPHZrrkz, X86::VMULPHZrmkz, 0 }, { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VMULSHZrr_Intkz, X86::VMULSHZrm_Intkz, TB_NO_REVERSE }, { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE }, { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 }, { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 }, @@ -4319,10 +4392,14 @@ { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, + { X86::VSUBPHZ128rrkz, X86::VSUBPHZ128rmkz, 0 }, + { X86::VSUBPHZ256rrkz, X86::VSUBPHZ256rmkz, 0 }, + { X86::VSUBPHZrrkz, X86::VSUBPHZrmkz, 0 }, { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VSUBSHZrr_Intkz, X86::VSUBSHZrm_Intkz, TB_NO_REVERSE }, { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE }, { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 }, { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 }, @@ -4348,10 +4425,14 @@ { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 }, { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 }, { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, + { X86::VADDPHZ128rrk, X86::VADDPHZ128rmk, 0 }, + { 
X86::VADDPHZ256rrk, X86::VADDPHZ256rmk, 0 }, + { X86::VADDPHZrrk, X86::VADDPHZrmk, 0 }, { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE }, + { X86::VADDSHZrr_Intk, X86::VADDSHZrm_Intk, TB_NO_REVERSE }, { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE }, { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 }, { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 }, @@ -4382,10 +4463,14 @@ { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 }, { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 }, { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, + { X86::VDIVPHZ128rrk, X86::VDIVPHZ128rmk, 0 }, + { X86::VDIVPHZ256rrk, X86::VDIVPHZ256rmk, 0 }, + { X86::VDIVPHZrrk, X86::VDIVPHZrmk, 0 }, { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE }, + { X86::VDIVSHZrr_Intk, X86::VDIVSHZrm_Intk, TB_NO_REVERSE }, { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE }, { X86::VDPBF16PSZ128rk, X86::VDPBF16PSZ128mk, 0 }, { X86::VDPBF16PSZ128rkz, X86::VDPBF16PSZ128mkz, 0 }, @@ -4701,38 +4786,56 @@ { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 }, { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 }, { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 }, + { X86::VMAXCPHZ128rrk, X86::VMAXCPHZ128rmk, 0 }, + { X86::VMAXCPHZ256rrk, X86::VMAXCPHZ256rmk, 0 }, + { X86::VMAXCPHZrrk, X86::VMAXCPHZrmk, 0 }, { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 }, { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 }, { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 }, { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }, { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 }, { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, + { X86::VMAXPHZ128rrk, X86::VMAXPHZ128rmk, 0 }, + { X86::VMAXPHZ256rrk, X86::VMAXPHZ256rmk, 0 }, + { X86::VMAXPHZrrk, X86::VMAXPHZrmk, 0 }, { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 }, { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, TB_NO_REVERSE }, + { X86::VMAXSHZrr_Intk, X86::VMAXSHZrm_Intk, TB_NO_REVERSE }, { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, TB_NO_REVERSE }, { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 }, { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 }, { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 }, + { X86::VMINCPHZ128rrk, X86::VMINCPHZ128rmk, 0 }, + { X86::VMINCPHZ256rrk, X86::VMINCPHZ256rmk, 0 }, + { X86::VMINCPHZrrk, X86::VMINCPHZrmk, 0 }, { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 }, { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 }, { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 }, { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, + { X86::VMINPHZ128rrk, X86::VMINPHZ128rmk, 0 }, + { X86::VMINPHZ256rrk, X86::VMINPHZ256rmk, 0 }, + { X86::VMINPHZrrk, X86::VMINPHZrmk, 0 }, { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, TB_NO_REVERSE }, + { X86::VMINSHZrr_Intk, X86::VMINSHZrm_Intk, TB_NO_REVERSE }, { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, TB_NO_REVERSE }, { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, + { X86::VMULPHZ128rrk, X86::VMULPHZ128rmk, 0 }, + { X86::VMULPHZ256rrk, X86::VMULPHZ256rmk, 0 }, + { 
X86::VMULPHZrrk, X86::VMULPHZrmk, 0 }, { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE }, + { X86::VMULSHZrr_Intk, X86::VMULSHZrm_Intk, TB_NO_REVERSE }, { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE }, { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 }, { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 }, @@ -5248,10 +5351,14 @@ { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, + { X86::VSUBPHZ128rrk, X86::VSUBPHZ128rmk, 0 }, + { X86::VSUBPHZ256rrk, X86::VSUBPHZ256rmk, 0 }, + { X86::VSUBPHZrrk, X86::VSUBPHZrmk, 0 }, { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE }, + { X86::VSUBSHZrr_Intk, X86::VSUBSHZrm_Intk, TB_NO_REVERSE }, { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE }, { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 }, { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 }, diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2487,6 +2487,10 @@ case X86::VCMPSSZrr: case X86::VCMPPDZrri: case X86::VCMPPSZrri: + case X86::VCMPSHZrr: + case X86::VCMPPHZrri: + case X86::VCMPPHZ128rri: + case X86::VCMPPHZ256rri: case X86::VCMPPDZ128rri: case X86::VCMPPSZ128rri: case X86::VCMPPDZ256rri: @@ -6047,6 +6051,31 @@ } } + if ((Opc == X86::VMOVSHZrm || Opc == X86::VMOVSHZrm_alt) && RegSize > 16) { + // These instructions only load 16 bits, we can't fold them if the + // destination register is wider than 16 bits (2 bytes), and its user + // instruction isn't scalar (SH). 
+ switch (UserOpc) { + case X86::VADDSHZrr_Int: + case X86::VCMPSHZrr_Int: + case X86::VDIVSHZrr_Int: + case X86::VMAXSHZrr_Int: + case X86::VMINSHZrr_Int: + case X86::VMULSHZrr_Int: + case X86::VSUBSHZrr_Int: + case X86::VADDSHZrr_Intk: case X86::VADDSHZrr_Intkz: + case X86::VCMPSHZrr_Intk: + case X86::VDIVSHZrr_Intk: case X86::VDIVSHZrr_Intkz: + case X86::VMAXSHZrr_Intk: case X86::VMAXSHZrr_Intkz: + case X86::VMINSHZrr_Intk: case X86::VMINSHZrr_Intkz: + case X86::VMULSHZrr_Intk: case X86::VMULSHZrr_Intkz: + case X86::VSUBSHZrr_Intk: case X86::VSUBSHZrr_Intkz: + return false; + default: + return true; + } + } + return false; } @@ -8401,6 +8430,14 @@ case X86::VMINCSSrr: case X86::VMINCSDZrr: case X86::VMINCSSZrr: + case X86::VMAXCPHZ128rr: + case X86::VMAXCPHZ256rr: + case X86::VMAXCPHZrr: + case X86::VMAXCSHZrr: + case X86::VMINCPHZ128rr: + case X86::VMINCPHZ256rr: + case X86::VMINCPHZrr: + case X86::VMINCSHZrr: return true; case X86::ADDPDrr: case X86::ADDPSrr: @@ -8438,6 +8475,14 @@ case X86::VMULSSrr: case X86::VMULSDZrr: case X86::VMULSSZrr: + case X86::VADDPHZ128rr: + case X86::VADDPHZ256rr: + case X86::VADDPHZrr: + case X86::VADDSHZrr: + case X86::VMULPHZ128rr: + case X86::VMULPHZ256rr: + case X86::VMULPHZrr: + case X86::VMULSHZrr: return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && Inst.getFlag(MachineInstr::MIFlag::FmNsz); default: diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -987,6 +987,34 @@ X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_256, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_512, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), X86_INTRINSIC_DATA(avx512bf16_mask_cvtneps2bf16_128, CVTNEPS2BF16_MASK, X86ISD::CVTNEPS2BF16, X86ISD::MCVTNEPS2BF16), + X86_INTRINSIC_DATA(avx512fp16_add_ph_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512fp16_div_ph_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_add_sh_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FADDS, X86ISD::FADDS_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_cmp_ph_128, CMP_MASK_CC, X86ISD::CMPMM, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_cmp_ph_256, CMP_MASK_CC, X86ISD::CMPMM, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_cmp_ph_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_cmp_sh, CMP_MASK_SCALAR_CC, + X86ISD::FSETCCM, X86ISD::FSETCCM_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_div_sh_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FDIVS, X86ISD::FDIVS_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_max_sh_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FMAXS, X86ISD::FMAXS_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_min_sh_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FMINS, X86ISD::FMINS_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_mul_sh_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FMULS, X86ISD::FMULS_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_sub_sh_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FSUBS, X86ISD::FSUBS_RND), + X86_INTRINSIC_DATA(avx512fp16_max_ph_128, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512fp16_max_ph_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512fp16_max_ph_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE), + X86_INTRINSIC_DATA(avx512fp16_min_ph_128, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512fp16_min_ph_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512fp16_min_ph_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, 
X86ISD::FMIN_SAE), + X86_INTRINSIC_DATA(avx512fp16_mul_ph_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512fp16_sub_ph_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512fp16_vcomi_sh, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0), diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll @@ -0,0 +1,284 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512fp16 -mattr=+avx512vl | FileCheck %s + +declare <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @test_int_x86_avx512fp16_add_ph_512(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_add_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_add_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_add_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res = select <32 x i1> %mask, <32 x half> %res0, <32 x half> %src + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_maskz_add_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_add_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddph (%rsi), %zmm1, %zmm1 {%k1} {z} +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x half>, <32 x half>* %ptr + %res0 = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res1 = select <32 x i1> %mask, <32 x half> %res0, <32 x half> zeroinitializer + %t2 = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> %x1, <32 x half> %val, i32 4) + %res2 = select <32 x i1> %mask, <32 x half> %t2, <32 x half> zeroinitializer + %res3 = fadd <32 x half> %res1, %res2 + ret <32 x half> %res3 +} + +define <32 x half> @test_int_x86_avx512fp16_add_ph_512_round(<32 x half> %x1, <32 x half> %x2, <32 x half> %src, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_add_ph_512_round: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddph {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %t1 = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> %x1, <32 x half> %x2, i32 10) + %res = select <32 x i1> %mask, <32 x half> %t1, <32 x half> %src + ret <32 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half>
@test_int_x86_avx512fp16_sub_ph_512(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_sub_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_sub_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_sub_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res = select <32 x i1> %mask, <32 x half> %res0, <32 x half> %src + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_maskz_sub_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_sub_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vsubph (%rsi), %zmm1, %zmm1 {%k1} {z} +; CHECK-NEXT: vsubph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x half>, <32 x half>* %ptr + %res0 = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res1 = select <32 x i1> %mask, <32 x half> %res0, <32 x half> zeroinitializer + %t2 = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> %x1, <32 x half> %val, i32 4) + %res2 = select <32 x i1> %mask, <32 x half> %t2, <32 x half> zeroinitializer + %res3 = fsub <32 x half> %res1, %res2 + ret <32 x half> %res3 +} + +define <32 x half> @test_int_x86_avx512fp16_sub_ph_512_round(<32 x half> %x1, <32 x half> %x2, <32 x half> %src, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_sub_ph_512_round: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubph {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %t1 = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> %x1, <32 x half> %x2, i32 10) + %res = select <32 x i1> %mask, <32 x half> %t1, <32 x half> %src + ret <32 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @test_int_x86_avx512fp16_mul_ph_512(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_mul_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_mul_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_mul_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res = select <32 x i1> %mask, <32 x half> %res0, <32 x half> %src + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_maskz_mul_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* 
%ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_mul_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vmulph (%rsi), %zmm1, %zmm1 {%k1} {z} +; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x half>, <32 x half>* %ptr + %res0 = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res1 = select <32 x i1> %mask, <32 x half> %res0, <32 x half> zeroinitializer + %t2 = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> %x1, <32 x half> %val, i32 4) + %res2 = select <32 x i1> %mask, <32 x half> %t2, <32 x half> zeroinitializer + %res3 = fmul <32 x half> %res1, %res2 + ret <32 x half> %res3 +} + +define <32 x half> @test_int_x86_avx512fp16_mul_ph_512_round(<32 x half> %x1, <32 x half> %x2, <32 x half> %src, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mul_ph_512_round: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulph {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %t1 = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> %x1, <32 x half> %x2, i32 10) + %res = select <32 x i1> %mask, <32 x half> %t1, <32 x half> %src + ret <32 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @test_int_x86_avx512fp16_div_ph_512(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_div_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_div_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_div_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res = select <32 x i1> %mask, <32 x half> %res0, <32 x half> %src + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_maskz_div_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_div_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vdivph (%rsi), %zmm1, %zmm1 {%k1} {z} +; CHECK-NEXT: vdivph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x half>, <32 x half>* %ptr + %res0 = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res1 = select <32 x i1> %mask, <32 x half> %res0, <32 x half> zeroinitializer + %t2 = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> %x1, <32 x half> %val, i32 4) + %res2 = select <32 x i1> %mask, <32 x half> %t2, <32 x half> zeroinitializer + %res3 = fdiv <32 x half> %res1, %res2 + ret <32 x half> %res3 +} + +define <32 x half> @test_int_x86_avx512fp16_div_ph_512_round(<32 x half> %x1, <32 x half> %x2, <32 x half> %src, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_div_ph_512_round: +; CHECK: # %bb.0: +; 
CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivph {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %t1 = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> %x1, <32 x half> %x2, i32 10) + %res = select <32 x i1> %mask, <32 x half> %t1, <32 x half> %src + ret <32 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @test_min_ph(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_min_ph: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res0 = fcmp olt <32 x half> %x1, %x2 + %res1 = select <32 x i1> %res0, <32 x half> %x1, <32 x half> %x2 + ret <32 x half> %res1 +} + +define <32 x half> @test_int_x86_avx512fp16_min_ph_512_sae(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_min_ph_512_sae: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph {sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res0 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %x1, <32 x half> %x2, i32 8) + ret <32 x half> %res0 +} + +define <32 x half> @test_int_x86_avx512fp16_maskz_min_ph_512_sae(<32 x half> %x1, <32 x half> %x2, i32 %msk) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_min_ph_512_sae: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vminph {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %x1, <32 x half> %x2, i32 8) + %res1 = select <32 x i1> %mask, <32 x half> %res0, <32 x half> zeroinitializer + ret <32 x half> %res1 +} + +declare <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @test_max_ph(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_max_ph: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res0 = fcmp ogt <32 x half> %x1, %x2 + %res1 = select <32 x i1> %res0, <32 x half> %x1, <32 x half> %x2 + ret <32 x half> %res1 +} + +define <32 x half> @test_int_x86_avx512fp16_max_ph_512_sae(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_max_ph_512_sae: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph {sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res0 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %x1, <32 x half> %x2, i32 8) + ret <32 x half> %res0 +} + +define <32 x half> @test_int_x86_avx512fp16_maskz_max_ph_512_sae(<32 x half> %x1, <32 x half> %x2, i32 %msk) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_max_ph_512_sae: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxph {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %x1, <32 x half> %x2, i32 8) + %res1 = select <32 x i1> %mask, <32 x half> %res0, <32 x half> zeroinitializer + ret <32 x half> %res1 +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-arith-vl-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-arith-vl-intrinsics.ll @@ -0,0 +1,404 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512fp16 | FileCheck %s + +define <16 x half> @test_int_x86_avx512fp16_add_ph_256(<16 x half> %x1, <16 x half> %x2) { +;
CHECK-LABEL: test_int_x86_avx512fp16_add_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = fadd <16 x half> %x1, %x2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_add_ph_256(<16 x half> %x1, <16 x half> %x2, <16 x half> %src, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_add_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %ymm2, %ymm3 +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vaddph (%rsi), %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vaddph %ymm2, %ymm3, %ymm0 +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %val = load <16 x half>, <16 x half>* %ptr + %res0 = fadd <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %src + %t3 = fadd <16 x half> %x1, %val + %res2 = select <16 x i1> %msk, <16 x half> %t3, <16 x half> %src + %res = fadd <16 x half> %res1 , %res2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_maskz_add_ph_256(<16 x half> %x1, <16 x half> %x2, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_add_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %res0 = fadd <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res1 +} + +define <8 x half> @test_int_x86_avx512fp16_add_ph_128(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_add_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fadd <8 x half> %x1, %x2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_add_ph_128(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_add_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vaddph (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddph %xmm2, %xmm3, %xmm0 +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %val = load <8 x half>, <8 x half>* %ptr + %res0 = fadd <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %src + %t3 = fadd <8 x half> %x1, %val + %res2 = select <8 x i1> %msk, <8 x half> %t3, <8 x half> %src + %res = fadd <8 x half> %res1 , %res2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_maskz_add_ph_128(<8 x half> %x1, <8 x half> %x2, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_add_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %res0 = fadd <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res1 +} + +define <16 x half> @test_int_x86_avx512fp16_sub_ph_256(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_sub_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = fsub <16 x half> %x1, %x2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_sub_ph_256(<16 x half> %x1, <16 x half> %x2, <16 x half> %src, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_sub_ph_256: +; CHECK: # %bb.0: 
+; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %ymm2, %ymm3 +; CHECK-NEXT: vsubph %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vsubph (%rsi), %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vsubph %ymm2, %ymm3, %ymm0 +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %val = load <16 x half>, <16 x half>* %ptr + %res0 = fsub <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %src + %t3 = fsub <16 x half> %x1, %val + %res2 = select <16 x i1> %msk, <16 x half> %t3, <16 x half> %src + %res = fsub <16 x half> %res1 , %res2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_maskz_sub_ph_256(<16 x half> %x1, <16 x half> %x2, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_sub_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubph %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %res0 = fsub <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res1 +} + +define <8 x half> @test_int_x86_avx512fp16_sub_ph_128(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_sub_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fsub <8 x half> %x1, %x2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_sub_ph_128(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_sub_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vsubph (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vsubph %xmm2, %xmm3, %xmm0 +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %val = load <8 x half>, <8 x half>* %ptr + %res0 = fsub <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %src + %t3 = fsub <8 x half> %x1, %val + %res2 = select <8 x i1> %msk, <8 x half> %t3, <8 x half> %src + %res = fsub <8 x half> %res1 , %res2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_maskz_sub_ph_128(<8 x half> %x1, <8 x half> %x2, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_sub_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %res0 = fsub <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res1 +} + +define <16 x half> @test_int_x86_avx512fp16_mul_ph_256(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_mul_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = fmul <16 x half> %x1, %x2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_mul_ph_256(<16 x half> %x1, <16 x half> %x2, <16 x half> %src, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_mul_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %ymm2, %ymm3 +; CHECK-NEXT: vmulph %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vmulph (%rsi), %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmulph %ymm2, %ymm3, %ymm0 +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %val = load <16 x half>, <16 x half>* %ptr + %res0 = fmul <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %src 
+ %t3 = fmul <16 x half> %x1, %val + %res2 = select <16 x i1> %msk, <16 x half> %t3, <16 x half> %src + %res = fmul <16 x half> %res1 , %res2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_maskz_mul_ph_256(<16 x half> %x1, <16 x half> %x2, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_mul_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulph %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %res0 = fmul <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res1 +} + +define <8 x half> @test_int_x86_avx512fp16_mul_ph_128(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_mul_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fmul <8 x half> %x1, %x2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_mul_ph_128(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_mul_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vmulph %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vmulph (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmulph %xmm2, %xmm3, %xmm0 +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %val = load <8 x half>, <8 x half>* %ptr + %res0 = fmul <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %src + %t3 = fmul <8 x half> %x1, %val + %res2 = select <8 x i1> %msk, <8 x half> %t3, <8 x half> %src + %res = fmul <8 x half> %res1 , %res2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_maskz_mul_ph_128(<8 x half> %x1, <8 x half> %x2, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_mul_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulph %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %res0 = fmul <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res1 +} + +define <16 x half> @test_int_x86_avx512fp16_div_ph_256(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_div_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = fdiv <16 x half> %x1, %x2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_div_ph_256(<16 x half> %x1, <16 x half> %x2, <16 x half> %src, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_div_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %ymm2, %ymm3 +; CHECK-NEXT: vdivph %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vdivph (%rsi), %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vdivph %ymm2, %ymm3, %ymm0 +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %val = load <16 x half>, <16 x half>* %ptr + %res0 = fdiv <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %src + %t3 = fdiv <16 x half> %x1, %val + %res2 = select <16 x i1> %msk, <16 x half> %t3, <16 x half> %src + %res = fdiv <16 x half> %res1 , %res2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_maskz_div_ph_256(<16 x half> %x1, <16 x half> %x2, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_div_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; 
CHECK-NEXT: vdivph %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %res0 = fdiv <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res1 +} + +define <8 x half> @test_int_x86_avx512fp16_div_ph_128(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_div_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fdiv <8 x half> %x1, %x2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_div_ph_128(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_div_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vdivph %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vdivph (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vdivph %xmm2, %xmm3, %xmm0 +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %val = load <8 x half>, <8 x half>* %ptr + %res0 = fdiv <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %src + %t3 = fdiv <8 x half> %x1, %val + %res2 = select <8 x i1> %msk, <8 x half> %t3, <8 x half> %src + %res = fdiv <8 x half> %res1 , %res2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_maskz_div_ph_128(<8 x half> %x1, <8 x half> %x2, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_div_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivph %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %res0 = fdiv <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res1 +} + +define <16 x half> @test_min_ph_256(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_min_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res0 = fcmp olt <16 x half> %x1, %x2 + %res1 = select <16 x i1> %res0, <16 x half> %x1, <16 x half> %x2 + ret <16 x half> %res1 +} + +define <16 x half> @test_max_ph_256(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_max_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res0 = fcmp ogt <16 x half> %x1, %x2 + %res1 = select <16 x i1> %res0, <16 x half> %x1, <16 x half> %x2 + ret <16 x half> %res1 +} + +define <8 x half> @test_min_ph_128(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_min_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res0 = fcmp olt <8 x half> %x1, %x2 + %res1 = select <8 x i1> %res0, <8 x half> %x1, <8 x half> %x2 + ret <8 x half> %res1 +} + +define <8 x half> @test_max_ph_128(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_max_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res0 = fcmp ogt <8 x half> %x1, %x2 + %res1 = select <8 x i1> %res0, <8 x half> %x1, <8 x half> %x2 + ret <8 x half> %res1 +} + +declare <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half>, <8 x half>) +declare <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half>, <16 x half>) + +define <8 x half> @test_max_ph_128_2(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_max_ph_128_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res0 = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> %x1, <8 x half> %x2) + ret <8 x half> %res0 +} + +define 
<16 x half> @test_max_ph_256_2(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_max_ph_256_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res0 = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> %x1, <16 x half> %x2) + ret <16 x half> %res0 +} + +declare <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half>, <8 x half>) +declare <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half>, <16 x half>) + +define <8 x half> @test_min_ph_128_2(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_min_ph_128_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res0 = call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> %x1, <8 x half> %x2) + ret <8 x half> %res0 +} + +define <16 x half> @test_min_ph_256_2(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_min_ph_256_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res0 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %x1, <16 x half> %x2) + ret <16 x half> %res0 +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith.ll b/llvm/test/CodeGen/X86/avx512fp16-arith.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-arith.ll @@ -0,0 +1,355 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=+avx512fp16 | FileCheck %s + +define <32 x half> @vaddph_512_test(<32 x half> %i, <32 x half> %j) nounwind readnone { +; CHECK-LABEL: vaddph_512_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %x = fadd <32 x half> %i, %j + ret <32 x half> %x +} + +define <32 x half> @vaddph_512_fold_test(<32 x half> %i, <32 x half>* %j) nounwind { +; CHECK-LABEL: vaddph_512_fold_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq + %tmp = load <32 x half>, <32 x half>* %j, align 4 + %x = fadd <32 x half> %i, %tmp + ret <32 x half> %x +} + +define <32 x half> @vaddph_512_broadc_test(<32 x half> %a) nounwind { +; CHECK-LABEL: vaddph_512_broadc_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vaddph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm0, %zmm0 +; CHECK-NEXT: retq + %b = fadd <32 x half> %a, + ret <32 x half> %b +} + +define <16 x half> @vaddph_256_broadc_test(<16 x half> %a) nounwind { +; CHECK-LABEL: vaddph_256_broadc_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vaddph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %ymm0, %ymm0 +; CHECK-NEXT: retq + %b = fadd <16 x half> %a, + ret <16 x half> %b +} + +define <8 x half> @vaddph_128_broadc_test(<8 x half> %a) nounwind { +; CHECK-LABEL: vaddph_128_broadc_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vaddph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %xmm0, %xmm0 +; CHECK-NEXT: retq + %b = fadd <8 x half> %a, + ret <8 x half> %b +} + +define <32 x half> @vaddph_512_mask_test1(<32 x half> %i, <32 x half> %j, <32 x i1> %mask) nounwind readnone { +; CHECK-LABEL: vaddph_512_mask_test1: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsllw $7, %ymm2, %ymm2 +; CHECK-NEXT: vpmovb2m %ymm2, %k1 +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 {%k1} +; CHECK-NEXT: retq + %x = fadd <32 x half> %i, %j + %r = select <32 x i1> %mask, <32 x half> %x, <32 x half> %i + ret <32 x half> %r +} + +define <32 x half> @vaddph_512_mask_test(<32 x half> %i, <32 x half> %j, <32 x half> %mask1) nounwind readnone { +; CHECK-LABEL: vaddph_512_mask_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpneq_oqph %zmm3, %zmm2, %k1 
+; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = fcmp one <32 x half> %mask1, zeroinitializer + %x = fadd <32 x half> %i, %j + %r = select <32 x i1> %mask, <32 x half> %x, <32 x half> %i + ret <32 x half> %r +} + +define <32 x half> @vaddph_512_maskz_test(<32 x half> %i, <32 x half> %j, <32 x half> %mask1) nounwind readnone { +; CHECK-LABEL: vaddph_512_maskz_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpneq_oqph %zmm3, %zmm2, %k1 +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = fcmp one <32 x half> %mask1, zeroinitializer + %x = fadd <32 x half> %i, %j + %r = select <32 x i1> %mask, <32 x half> %x, <32 x half> zeroinitializer + ret <32 x half> %r +} + +define <32 x half> @vaddph_512_mask_fold_test(<32 x half> %i, <32 x half>* %j.ptr, <32 x half> %mask1) nounwind readnone { +; CHECK-LABEL: vaddph_512_mask_fold_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpneq_oqph %zmm2, %zmm1, %k1 +; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = fcmp one <32 x half> %mask1, zeroinitializer + %j = load <32 x half>, <32 x half>* %j.ptr + %x = fadd <32 x half> %i, %j + %r = select <32 x i1> %mask, <32 x half> %x, <32 x half> %i + ret <32 x half> %r +} + +define <32 x half> @vaddph_512_maskz_fold_test(<32 x half> %i, <32 x half>* %j.ptr, <32 x half> %mask1) nounwind readnone { +; CHECK-LABEL: vaddph_512_maskz_fold_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpneq_oqph %zmm2, %zmm1, %k1 +; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = fcmp one <32 x half> %mask1, zeroinitializer + %j = load <32 x half>, <32 x half>* %j.ptr + %x = fadd <32 x half> %i, %j + %r = select <32 x i1> %mask, <32 x half> %x, <32 x half> zeroinitializer + ret <32 x half> %r +} + +define <32 x half> @vaddph_512_maskz_fold_test_2(<32 x half> %i, <32 x half>* %j.ptr, <32 x half> %mask1) nounwind readnone { +; CHECK-LABEL: vaddph_512_maskz_fold_test_2: +; CHECK: ## %bb.0: +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpneq_oqph %zmm2, %zmm1, %k1 +; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = fcmp one <32 x half> %mask1, zeroinitializer + %j = load <32 x half>, <32 x half>* %j.ptr + %x = fadd <32 x half> %j, %i + %r = select <32 x i1> %mask, <32 x half> %x, <32 x half> zeroinitializer + ret <32 x half> %r +} + +define <32 x half> @vsubph_512_test(<32 x half> %i, <32 x half> %j) nounwind readnone { +; CHECK-LABEL: vsubph_512_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vsubph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %x = fsub <32 x half> %i, %j + ret <32 x half> %x +} + +define <32 x half> @vmulph_512_test(<32 x half> %i, <32 x half> %j) nounwind readnone { +; CHECK-LABEL: vmulph_512_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %x = fmul <32 x half> %i, %j + ret <32 x half> %x +} + +define <32 x half> @vdivph_512_test(<32 x half> %i, <32 x half> %j) nounwind readnone { +; CHECK-LABEL: vdivph_512_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vdivph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %x = fdiv <32 x half> %i, %j + ret <32 x half> %x +} + +define half @add_sh(half %i, half %j, half* %x.ptr) nounwind readnone { +; CHECK-LABEL: add_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %x = load half, half* %x.ptr + %y = fadd half %i, %j 
+ %r = fadd half %x, %y + ret half %r +} + +define half @sub_sh(half %i, half %j, half* %x.ptr) nounwind readnone { +; CHECK-LABEL: sub_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovsh (%rdi), %xmm2 +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vsubsh %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %x = load half, half* %x.ptr + %y = fsub half %i, %j + %r = fsub half %x, %y + ret half %r +} + +define half @sub_sh_2(half %i, half %j, half* %x.ptr) nounwind readnone { +; CHECK-LABEL: sub_sh_2: +; CHECK: ## %bb.0: +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vsubsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %x = load half, half* %x.ptr + %y = fsub half %i, %j + %r = fsub half %y, %x + ret half %r +} + +define half @mul_sh(half %i, half %j, half* %x.ptr) nounwind readnone { +; CHECK-LABEL: mul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmulsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmulsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %x = load half, half* %x.ptr + %y = fmul half %i, %j + %r = fmul half %x, %y + ret half %r +} + +define half @div_sh(half %i, half %j, half* %x.ptr) nounwind readnone { +; CHECK-LABEL: div_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovsh (%rdi), %xmm2 +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vdivsh %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %x = load half, half* %x.ptr + %y = fdiv half %i, %j + %r = fdiv half %x, %y + ret half %r +} + +define half @div_sh_2(half %i, half %j, half* %x.ptr) nounwind readnone { +; CHECK-LABEL: div_sh_2: +; CHECK: ## %bb.0: +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vdivsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %x = load half, half* %x.ptr + %y = fdiv half %i, %j + %r = fdiv half %y, %x + ret half %r +} + +define i1 @cmp_une_sh(half %x, half %y) { +; CHECK-LABEL: cmp_une_sh: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpneqsh %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: retq +entry: + %0 = fcmp une half %x, %y + ret i1 %0 +} + +define i1 @cmp_oeq_sh(half %x, half %y) { +; CHECK-LABEL: cmp_oeq_sh: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqsh %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: retq +entry: + %0 = fcmp oeq half %x, %y + ret i1 %0 +} + +define i1 @cmp_olt_sh(half %x, half %y) { +; CHECK-LABEL: cmp_olt_sh: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vucomish %xmm0, %xmm1 +; CHECK-NEXT: seta %al +; CHECK-NEXT: retq + entry: + %0 = fcmp olt half %x, %y + ret i1 %0 +} + +define <32 x i1> @cmp_ph(<32 x half> %x, <32 x half> %y) { +; CHECK-LABEL: cmp_ph: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpneqph %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpmovm2b %k0, %ymm0 +; CHECK-NEXT: retq +entry: + %0 = fcmp une <32 x half> %x, %y + ret <32 x i1> %0 +} + +define <8 x i1> @fcmp_v8f16(<8 x half> %a, <8 x half> %b) +; CHECK-LABEL: fcmp_v8f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqph %xmm1, %xmm0, %k0 +; CHECK-NEXT: vpmovm2w %k0, %xmm0 +; CHECK-NEXT: retq +{ +entry: + %0 = fcmp oeq <8 x half> %a, %b + ret <8 x i1> %0 +} + +define <16 x i1> @fcmp_v16f16(<16 x half> %a, <16 x half> %b) +; CHECK-LABEL: fcmp_v16f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqph %ymm1, %ymm0, %k0 +; CHECK-NEXT: vpmovm2b %k0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +{ +entry: + %0 = fcmp oeq <16 x half> %a, %b + ret <16 x i1> %0 +} + +define <32 x i1> @fcmp_v32f16(<32 x half> %a, <32 x half> %b) +; CHECK-LABEL: fcmp_v32f16: +; CHECK: ## %bb.0: ## 
%entry +; CHECK-NEXT: vcmpeqph %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpmovm2b %k0, %ymm0 +; CHECK-NEXT: retq +{ +entry: + %0 = fcmp oeq <32 x half> %a, %b + ret <32 x i1> %0 +} + +define <8 x i16> @zext_fcmp_v8f16(<8 x half> %a, <8 x half> %b) +; CHECK-LABEL: zext_fcmp_v8f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqph %xmm1, %xmm0, %k0 +; CHECK-NEXT: vpmovm2w %k0, %xmm0 +; CHECK-NEXT: vpsrlw $15, %xmm0, %xmm0 +; CHECK-NEXT: retq +{ +entry: + %0 = fcmp oeq <8 x half> %a, %b + %1 = zext <8 x i1> %0 to <8 x i16> + ret <8 x i16> %1 +} + +define <16 x i16> @zext_fcmp_v16f16(<16 x half> %a, <16 x half> %b) +; CHECK-LABEL: zext_fcmp_v16f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqph %ymm1, %ymm0, %k0 +; CHECK-NEXT: vpmovm2w %k0, %ymm0 +; CHECK-NEXT: vpsrlw $15, %ymm0, %ymm0 +; CHECK-NEXT: retq +{ +entry: + %0 = fcmp oeq <16 x half> %a, %b + %1 = zext <16 x i1> %0 to <16 x i16> + ret <16 x i16> %1 +} + +define <32 x i16> @zext_fcmp_v32f16(<32 x half> %a, <32 x half> %b) +; CHECK-LABEL: zext_fcmp_v32f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqph %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpmovm2w %k0, %zmm0 +; CHECK-NEXT: vpsrlw $15, %zmm0, %zmm0 +; CHECK-NEXT: retq +{ +entry: + %0 = fcmp oeq <32 x half> %a, %b + %1 = zext <32 x i1> %0 to <32 x i16> + ret <32 x i16> %1 +} + diff --git a/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK + +declare half @llvm.maxnum.f16(half, half) +declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) +declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>) +declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>) +declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>) +declare <32 x half> @llvm.maxnum.v32f16(<32 x half>, <32 x half>) + +define half @test_intrinsic_fmaxh(half %x, half %y) { +; CHECK-LABEL: test_intrinsic_fmaxh: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0] +; CHECK-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call half @llvm.maxnum.f16(half %x, half %y) readnone + ret half %z +} + +define <2 x half> @test_intrinsic_fmax_v2f16(<2 x half> %x, <2 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmax_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0] +; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> %y) readnone + ret <2 x half> %z +} + +define <4 x half> @test_intrinsic_fmax_v4f16(<4 x half> %x, <4 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmax_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %xmm0, %xmm1, %xmm2 # encoding: 
[0x62,0xf5,0x74,0x08,0x5f,0xd0] +; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %x, <4 x half> %y) readnone + ret <4 x half> %z +} + +define <8 x half> @test_intrinsic_fmax_v8f16(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmax_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0] +; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %x, <8 x half> %y) readnone + ret <8 x half> %z +} + +define <16 x half> @test_intrinsic_fmax_v16f16(<16 x half> %x, <16 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmax_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %ymm0, %ymm1, %ymm2 # encoding: [0x62,0xf5,0x74,0x28,0x5f,0xd0] +; CHECK-NEXT: vcmpunordph %ymm0, %ymm0, %k1 # encoding: [0x62,0xf3,0x7c,0x28,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <16 x half> @llvm.maxnum.v16f16(<16 x half> %x, <16 x half> %y) readnone + ret <16 x half> %z +} + +define <32 x half> @test_intrinsic_fmax_v32f16(<32 x half> %x, <32 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmax_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm2 # encoding: [0x62,0xf5,0x74,0x48,0x5f,0xd0] +; CHECK-NEXT: vcmpunordph %zmm0, %zmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1] +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <32 x half> @llvm.maxnum.v32f16(<32 x half> %x, <32 x half> %y) readnone + ret <32 x half> %z +} + +define <4 x half> @maxnum_intrinsic_nnan_fmf_f432(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: maxnum_intrinsic_nnan_fmf_f432: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x5f,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %r = tail call nnan <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %r +} + +define half @maxnum_intrinsic_nnan_attr_f16(half %a, half %b) #0 { +; CHECK-LABEL: maxnum_intrinsic_nnan_attr_f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x5f,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %r = tail call half @llvm.maxnum.f16(half %a, half %b) + ret half %r +} + +define half @test_maxnum_const_op1(half %x) { +; CHECK-LABEL: test_maxnum_const_op1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x5f,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; CHECK-NEXT: retq # encoding: [0xc3] + %r = call half @llvm.maxnum.f16(half 1.0, half %x) + ret half %r +} + 
+define half @test_maxnum_const_op2(half %x) { +; CHECK-LABEL: test_maxnum_const_op2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x5f,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; CHECK-NEXT: retq # encoding: [0xc3] + %r = call half @llvm.maxnum.f16(half %x, half 1.0) + ret half %r +} + +define half @test_maxnum_const_nan(half %x) { +; CHECK-LABEL: test_maxnum_const_nan: +; CHECK: # %bb.0: +; CHECK-NEXT: retq # encoding: [0xc3] + %r = call half @llvm.maxnum.f16(half %x, half 0x7fff000000000000) + ret half %r +} + +attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true"} diff --git a/llvm/test/CodeGen/X86/avx512fp16-fminnum.ll b/llvm/test/CodeGen/X86/avx512fp16-fminnum.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-fminnum.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK + +declare half @llvm.minnum.f16(half, half) +declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) +declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>) +declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>) +declare <16 x half> @llvm.minnum.v16f16(<16 x half>, <16 x half>) +declare <32 x half> @llvm.minnum.v32f16(<32 x half>, <32 x half>) + +define half @test_intrinsic_fminh(half %x, half %y) { +; CHECK-LABEL: test_intrinsic_fminh: +; CHECK: # %bb.0: +; CHECK-NEXT: vminsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5d,0xd0] +; CHECK-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call half @llvm.minnum.f16(half %x, half %y) readnone + ret half %z +} + +define <2 x half> @test_intrinsic_fmin_v2f16(<2 x half> %x, <2 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmin_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5d,0xd0] +; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <2 x half> @llvm.minnum.v2f16(<2 x half> %x, <2 x half> %y) readnone + ret <2 x half> %z +} + +define <4 x half> @test_intrinsic_fmin_v4f16(<4 x half> %x, <4 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmin_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5d,0xd0] +; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <4 x half> @llvm.minnum.v4f16(<4 x half> %x, <4 x half> %y) readnone + ret <4 x half> %z +} + +define <8 x half> @test_intrinsic_fmin_v8f16(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: 
test_intrinsic_fmin_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5d,0xd0] +; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <8 x half> @llvm.minnum.v8f16(<8 x half> %x, <8 x half> %y) readnone + ret <8 x half> %z +} + +define <16 x half> @test_intrinsic_fmin_v16f16(<16 x half> %x, <16 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmin_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %ymm0, %ymm1, %ymm2 # encoding: [0x62,0xf5,0x74,0x28,0x5d,0xd0] +; CHECK-NEXT: vcmpunordph %ymm0, %ymm0, %k1 # encoding: [0x62,0xf3,0x7c,0x28,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <16 x half> @llvm.minnum.v16f16(<16 x half> %x, <16 x half> %y) readnone + ret <16 x half> %z +} + +define <32 x half> @test_intrinsic_fmin_v32f16(<32 x half> %x, <32 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmin_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm2 # encoding: [0x62,0xf5,0x74,0x48,0x5d,0xd0] +; CHECK-NEXT: vcmpunordph %zmm0, %zmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1] +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <32 x half> @llvm.minnum.v32f16(<32 x half> %x, <32 x half> %y) readnone + ret <32 x half> %z +} + +define <4 x half> @minnum_intrinsic_nnan_fmf_f432(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: minnum_intrinsic_nnan_fmf_f432: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x5d,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %r = tail call nnan <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %r +} + +define half @minnum_intrinsic_nnan_attr_f16(half %a, half %b) #0 { +; CHECK-LABEL: minnum_intrinsic_nnan_attr_f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x5d,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %r = tail call half @llvm.minnum.f16(half %a, half %b) + ret half %r +} + +define half @test_minnum_const_op1(half %x) { +; CHECK-LABEL: test_minnum_const_op1: +; CHECK: # %bb.0: +; CHECK-NEXT: vminsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x5d,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; CHECK-NEXT: retq # encoding: [0xc3] + %r = call half @llvm.minnum.f16(half 1.0, half %x) + ret half %r +} + +define half @test_minnum_const_op2(half %x) { +; CHECK-LABEL: test_minnum_const_op2: +; CHECK: # %bb.0: +; CHECK-NEXT: vminsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x5d,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; CHECK-NEXT: retq # encoding: [0xc3] + %r = call half @llvm.minnum.f16(half %x, half 1.0) + ret half %r +} + +define half @test_minnum_const_nan(half %x) { +; CHECK-LABEL: test_minnum_const_nan: +; CHECK: # %bb.0: +; 
CHECK-NEXT: retq # encoding: [0xc3] + %r = call half @llvm.minnum.f16(half %x, half 0x7fff000000000000) + ret half %r +} + +attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/avx512fp16-fold-load-binops.ll b/llvm/test/CodeGen/X86/avx512fp16-fold-load-binops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-fold-load-binops.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s + +; Verify that we're folding the load into the math instruction. +; This pattern is generated out of the simplest intrinsics usage: +; _mm_add_ss(a, _mm_load_ss(b)); + +define <8 x half> @addsh(<8 x half> %va, half* %pb) { +; CHECK-LABEL: addsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = extractelement <8 x half> %va, i32 0 + %b = load half, half* %pb + %r = fadd half %a, %b + %vr = insertelement <8 x half> %va, half %r, i32 0 + ret <8 x half> %vr +} + +define <8 x half> @subsh(<8 x half> %va, half* %pb) { +; CHECK-LABEL: subsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = extractelement <8 x half> %va, i32 0 + %b = load half, half* %pb + %r = fsub half %a, %b + %vr = insertelement <8 x half> %va, half %r, i32 0 + ret <8 x half> %vr +} + +define <8 x half> @mulsh(<8 x half> %va, half* %pb) { +; CHECK-LABEL: mulsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = extractelement <8 x half> %va, i32 0 + %b = load half, half* %pb + %r = fmul half %a, %b + %vr = insertelement <8 x half> %va, half %r, i32 0 + ret <8 x half> %vr +} + +define <8 x half> @divsh(<8 x half> %va, half* %pb) { +; CHECK-LABEL: divsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = extractelement <8 x half> %va, i32 0 + %b = load half, half* %pb + %r = fdiv half %a, %b + %vr = insertelement <8 x half> %va, half %r, i32 0 + ret <8 x half> %vr +} + +define <8 x half> @minsh(<8 x half> %va, half* %pb) { +; CHECK-LABEL: minsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vminsh (%rdi), %xmm0, %xmm1 +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = extractelement <8 x half> %va, i32 0 + %b = load half, half* %pb + %r = call nnan half @llvm.minnum.f16(half %a, half %b) readnone + %vr = insertelement <8 x half> %va, half %r, i32 0 + ret <8 x half> %vr +} + +define <8 x half> @maxsh(<8 x half> %va, half* %pb) { +; CHECK-LABEL: maxsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxsh (%rdi), %xmm0, %xmm1 +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = extractelement <8 x half> %va, i32 0 + %b = load half, half* %pb + %r = call nnan half @llvm.maxnum.f16(half %a, half %b) readnone + %vr = insertelement <8 x half> %va, half %r, i32 0 + ret <8 x half> %vr +} + +declare half @llvm.minnum.f16(half, half) +declare half @llvm.maxnum.f16(half, half) diff --git a/llvm/test/CodeGen/X86/avx512fp16-fold-xmm-zero.ll b/llvm/test/CodeGen/X86/avx512fp16-fold-xmm-zero.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-fold-xmm-zero.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -mtriple=i386-apple-macosx10.6.7 -mattr=+avx512fp16 -no-integrated-as | FileCheck %s + +; Simple test to make sure folding for special constants (like half zero) +; isn't completely broken.
+ +; CHECK: vdivsh LCPI0 + +%0 = type { half, half, half, half, half, half, half, half } + +define void @f() nounwind ssp { +entry: + %0 = tail call %0 asm sideeffect "foo", "={xmm0},={xmm1},={xmm2},={xmm3},={xmm4},={xmm5},={xmm6},={xmm7},0,1,2,3,4,5,6,7,~{dirflag},~{fpsr},~{flags}"(half 1.000000e+00, half 2.000000e+00, half 3.000000e+00, half 4.000000e+00, half 5.000000e+00, half 6.000000e+00, half 7.000000e+00, half 8.000000e+00) nounwind + %asmresult = extractvalue %0 %0, 0 + %asmresult8 = extractvalue %0 %0, 1 + %asmresult9 = extractvalue %0 %0, 2 + %asmresult10 = extractvalue %0 %0, 3 + %asmresult11 = extractvalue %0 %0, 4 + %asmresult12 = extractvalue %0 %0, 5 + %asmresult13 = extractvalue %0 %0, 6 + %asmresult14 = extractvalue %0 %0, 7 + %div = fdiv half %asmresult, 0.000000e+00 + %1 = tail call %0 asm sideeffect "bar", "={xmm0},={xmm1},={xmm2},={xmm3},={xmm4},={xmm5},={xmm6},={xmm7},0,1,2,3,4,5,6,7,~{dirflag},~{fpsr},~{flags}"(half %div, half %asmresult8, half %asmresult9, half %asmresult10, half %asmresult11, half %asmresult12, half %asmresult13, half %asmresult14) nounwind + %asmresult24 = extractvalue %0 %1, 0 + %asmresult25 = extractvalue %0 %1, 1 + %asmresult26 = extractvalue %0 %1, 2 + %asmresult27 = extractvalue %0 %1, 3 + %asmresult28 = extractvalue %0 %1, 4 + %asmresult29 = extractvalue %0 %1, 5 + %asmresult30 = extractvalue %0 %1, 6 + %asmresult31 = extractvalue %0 %1, 7 + %div33 = fdiv half %asmresult24, 0.000000e+00 + %2 = tail call %0 asm sideeffect "baz", "={xmm0},={xmm1},={xmm2},={xmm3},={xmm4},={xmm5},={xmm6},={xmm7},0,1,2,3,4,5,6,7,~{dirflag},~{fpsr},~{flags}"(half %div33, half %asmresult25, half %asmresult26, half %asmresult27, half %asmresult28, half %asmresult29, half %asmresult30, half %asmresult31) nounwind + ret void +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll b/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll @@ -0,0 +1,381 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s + +; Test cases derived from float/double tests in fp-logic.ll + +; 1 FP operand, 1 int operand, int result + +define i16 @f1(half %x, i16 %y) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: andl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %bc1, %y + ret i16 %and +} + +; Swap operands of the logic op. + +define i16 @f2(half %x, i16 %y) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: andl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %y, %bc1 + ret i16 %and +} + +; 1 FP operand, 1 constant operand, int result + +define i16 @f3(half %x) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %bc1, 1 + ret i16 %and +} + +; Swap operands of the logic op. 
+ +define i16 @f4(half %x) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: andl $2, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 2, %bc1 + ret i16 %and +} + +; 1 FP operand, 1 integer operand, FP result + +define half @f5(half %x, i16 %y) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %edi, %xmm1 +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %bc1, %y + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +; Swap operands of the logic op. + +define half @f6(half %x, i16 %y) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %edi, %xmm1 +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %y, %bc1 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +; 1 FP operand, 1 constant operand, FP result + +define half @f7(half %x) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %bc1, 3 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +; Swap operands of the logic op. + +define half @f8(half %x) { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 4, %bc1 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +; 2 FP operands, int result + +define i16 @f9(half %x, half %y) { +; CHECK-LABEL: f9: +; CHECK: # %bb.0: +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %bc2 = bitcast half %y to i16 + %and = and i16 %bc1, %bc2 + ret i16 %and +} + +; 2 FP operands, FP result + +define half @f10(half %x, half %y) { +; CHECK-LABEL: f10: +; CHECK: # %bb.0: +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %bc2 = bitcast half %y to i16 + %and = and i16 %bc1, %bc2 + %bc3 = bitcast i16 %and to half + ret half %bc3 +} + +define half @or(half %x, half %y) { +; CHECK-LABEL: or: +; CHECK: # %bb.0: +; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %bc2 = bitcast half %y to i16 + %and = or i16 %bc1, %bc2 + %bc3 = bitcast i16 %and to half + ret half %bc3 +} + +define half @xor(half %x, half %y) { +; CHECK-LABEL: xor: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %bc2 = bitcast half %y to i16 + %and = xor i16 %bc1, %bc2 + %bc3 = bitcast i16 %and to half + ret half %bc3 +} + +define half @f7_or(half %x) { +; CHECK-LABEL: f7_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = or i16 %bc1, 3 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +define half @f7_xor(half %x) { +; CHECK-LABEL: f7_xor: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = xor i16 %bc1, 3 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +; Grabbing the sign bit is a special case that could be handled +; 
by movmskps/movmskpd, but if we're not shifting it over, then +; a simple FP logic op is cheaper. + +define half @movmsk(half %x) { +; CHECK-LABEL: movmsk: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %bc1, 32768 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +define half @bitcast_fabs(half %x) { +; CHECK-LABEL: bitcast_fabs: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %bc1, 32767 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +define half @bitcast_fneg(half %x) { +; CHECK-LABEL: bitcast_fneg: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %xor = xor i16 %bc1, 32768 + %bc2 = bitcast i16 %xor to half + ret half %bc2 +} + +define <8 x half> @bitcast_fabs_vec(<8 x half> %x) { +; CHECK-LABEL: bitcast_fabs_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %x to <8 x i16> + %and = and <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %and to <8 x half> + ret <8 x half> %bc2 +} + +define <8 x half> @bitcast_fneg_vec(<8 x half> %x) { +; CHECK-LABEL: bitcast_fneg_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %x to <8 x i16> + %xor = xor <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %xor to <8 x half> + ret <8 x half> %bc2 +} + +define half @fadd_bitcast_fneg(half %x, half %y) { +; CHECK-LABEL: fadd_bitcast_fneg: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %y to i16 + %xor = xor i16 %bc1, 32768 + %bc2 = bitcast i16 %xor to half + %fadd = fadd half %x, %bc2 + ret half %fadd +} + +define half @fsub_bitcast_fneg(half %x, half %y) { +; CHECK-LABEL: fsub_bitcast_fneg: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: vxorps %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %y to i16 + %xor = xor i16 %bc1, 32767 + %bc2 = bitcast i16 %xor to half + %fsub = fsub half %x, %bc2 + ret half %fsub +} + +define half @nabs(half %a) { +; CHECK-LABEL: nabs: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %conv = bitcast half %a to i16 + %and = or i16 %conv, -32768 + %conv1 = bitcast i16 %and to half + ret half %conv1 +} + +define <8 x half> @nabsv8f16(<8 x half> %a) { +; CHECK-LABEL: nabsv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %conv = bitcast <8 x half> %a to <8 x i16> + %and = or <8 x i16> %conv, + %conv1 = bitcast <8 x i16> %and to <8 x half> + ret <8 x half> %conv1 +} + +define <8 x half> @fadd_bitcast_fneg_vec(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: 
fadd_bitcast_fneg_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %y to <8 x i16> + %xor = xor <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %xor to <8 x half> + %fadd = fadd <8 x half> %x, %bc2 + ret <8 x half> %fadd +} + +define <8 x half> @fadd_bitcast_fneg_vec_undef_elts(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: fadd_bitcast_fneg_vec_undef_elts: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %y to <8 x i16> + %xor = xor <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %xor to <8 x half> + %fadd = fadd <8 x half> %x, %bc2 + ret <8 x half> %fadd +} + +define <8 x half> @fsub_bitcast_fneg_vec(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: fsub_bitcast_fneg_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %y to <8 x i16> + %xor = xor <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %xor to <8 x half> + %fsub = fsub <8 x half> %x, %bc2 + ret <8 x half> %fsub +} + +define <8 x half> @fsub_bitcast_fneg_vec_undef_elts(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: fsub_bitcast_fneg_vec_undef_elts: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %y to <8 x i16> + %xor = xor <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %xor to <8 x half> + %fsub = fsub <8 x half> %x, %bc2 + ret <8 x half> %fsub +} + +define <8 x half> @fadd_bitcast_fneg_vec_width(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: fadd_bitcast_fneg_vec_width: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %y to <2 x i64> + %xor = xor <2 x i64> %bc1, + %bc2 = bitcast <2 x i64> %xor to <8 x half> + %fadd = fadd <8 x half> %x, %bc2 + ret <8 x half> %fadd +} + +define <8 x half> @fsub_bitcast_fneg_vec_width(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: fsub_bitcast_fneg_vec_width: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %y to <2 x i64> + %xor = xor <2 x i64> %bc1, + %bc2 = bitcast <2 x i64> %xor to <8 x half> + %fsub = fsub <8 x half> %x, %bc2 + ret <8 x half> %fsub +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s + +declare i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half>, <8 x half>, i32, i32) + +define i32 @test_x86_avx512fp16_ucomi_sh_lt(<8 x half> %a0, <8 x half> %a1) { +; CHECK-LABEL: test_x86_avx512fp16_ucomi_sh_lt: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpngesh %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %a0, <8 x half> %a1, i32 9, i32 4) + ret i32 %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_add_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, 
%xmm0 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vaddsh %xmm1, %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vaddsh (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %val.half = load half,half * %ptr + %val = insertelement <8 x half> undef, half %val.half, i32 0 + %res0 = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> %x1, <8 x half> %x2, <8 x half> zeroinitializer, i8 -1, i32 4) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> %res0, <8 x half> %x2, <8 x half> %src , i8 %mask, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> %res1, <8 x half> %x2, <8 x half> zeroinitializer , i8 %mask, i32 4) + %res3 = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> %res2, <8 x half> %val, <8 x half> %src , i8 %mask, i32 4) + ret <8 x half> %res3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_sub_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vsubsh %xmm1, %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vsubsh (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %val.half = load half,half * %ptr + %val = insertelement <8 x half> undef, half %val.half, i32 0 + %res0 = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> %x1, <8 x half> %x2, <8 x half> zeroinitializer, i8 -1, i32 4) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> %res0, <8 x half> %x2, <8 x half> %src , i8 %mask, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> %res1, <8 x half> %x2, <8 x half> zeroinitializer , i8 %mask, i32 4) + %res3 = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> %res2, <8 x half> %val, <8 x half> %src , i8 %mask, i32 4) + ret <8 x half> %res3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_mul_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vmulsh %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vmulsh %xmm1, %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmulsh (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %val.half = load half,half * %ptr + %val = insertelement <8 x half> undef, half %val.half, i32 0 + %res0 = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> %x1, <8 x half> %x2, <8 x half> zeroinitializer, i8 -1, i32 4) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> %res0, <8 x half> %x2, <8 x half> %src , i8 %mask, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> %res1, <8 x half> %x2, <8 x half> zeroinitializer , i8 %mask, i32 4) + %res3 = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> %res2, <8 x half> %val, <8 x half> %src , i8 %mask, i32 4) + ret <8 x half> %res3 +} + +declare <8 x half> 
@llvm.x86.avx512fp16.mask.div.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_div_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_div_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vdivsh %xmm1, %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vdivsh (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %val.half = load half,half * %ptr + %val = insertelement <8 x half> undef, half %val.half, i32 0 + %res0 = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> %x1, <8 x half> %x2, <8 x half> zeroinitializer, i8 -1, i32 4) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> %res0, <8 x half> %x2, <8 x half> %src , i8 %mask, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> %res1, <8 x half> %x2, <8 x half> zeroinitializer , i8 %mask, i32 4) + %res3 = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> %res2, <8 x half> %val, <8 x half> %src , i8 %mask, i32 4) + ret <8 x half> %res3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_min_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_min_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vminsh %xmm1, %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vminsh (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %val.half = load half,half * %ptr + %val = insertelement <8 x half> undef, half %val.half, i32 0 + %res0 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %x1, <8 x half> %x2, <8 x half> zeroinitializer, i8 -1, i32 4) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %res0, <8 x half> %x2, <8 x half> %src , i8 %mask, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %res1, <8 x half> %x2, <8 x half> zeroinitializer , i8 %mask, i32 4) + %res3 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %res2, <8 x half> %val, <8 x half> %src , i8 %mask, i32 4) + ret <8 x half> %res3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_max_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_max_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vmaxsh %xmm1, %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmaxsh (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %val.half = load half,half * %ptr + %val = insertelement <8 x half> undef, half %val.half, i32 0 + %res0 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %x1, <8 x half> %x2, <8 x half> zeroinitializer, i8 -1, i32 4) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %res0, <8 x half> %x2, <8 x half> 
%src , i8 %mask, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %res1, <8 x half> %x2, <8 x half> zeroinitializer , i8 %mask, i32 4) + %res3 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %res2, <8 x half> %val, <8 x half> %src , i8 %mask, i32 4) + ret <8 x half> %res3 +} + +declare i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half>, <8 x half>, i32, i8, i32) + +define i8 @test_int_x86_avx512_mask_cmp_sh(<8 x half> %x0, <8 x half> %x1, i8 %x3, i32 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpunordsh %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq + %res2 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 3, i8 %x3, i32 4) + ret i8 %res2 +} + + +define i8 @test_int_x86_avx512_mask_cmp_sh_all(<8 x half> %x0, <8 x half> %x1, i8 %x3, i32 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sh_all: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmplesh %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vcmpunordsh {sae}, %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %edx +; CHECK-NEXT: vcmpneqsh %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %esi +; CHECK-NEXT: vcmpnltsh {sae}, %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: andb %sil, %al +; CHECK-NEXT: andb %dl, %al +; CHECK-NEXT: andb %cl, %al +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq + %res1 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 2, i8 -1, i32 4) + %res2 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 3, i8 -1, i32 8) + %res3 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 4, i8 %x3, i32 4) + %res4 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 5, i8 %x3, i32 8) + + %res11 = and i8 %res1, %res2 + %res12 = and i8 %res3, %res4 + %res13 = and i8 %res11, %res12 + ret i8 %res13 +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-machine-combiner.ll b/llvm/test/CodeGen/X86/avx512fp16-machine-combiner.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-machine-combiner.ll @@ -0,0 +1,345 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512fp16 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s + +; Incremental updates of the instruction depths should be enough for this test +; case. +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512fp16 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s + +; Verify that the first two adds are independent regardless of how the inputs are +; commuted. The destination registers are used as source registers for the third add. 
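For context on the machine-combiner checks that follow: the reassoc and nsz flags in the IR are what clang attaches to ordinary _Float16 arithmetic under fast-math. A minimal source-level sketch, illustrative only and assuming a clang that accepts -mavx512fp16 together with -ffast-math:

/* C left-associates, so this is ((x0 + x1) + x2) + x3 in IR; with the
   reassoc/nsz flags present, the combiner rebalances it into
   (x0 + x1) + (x2 + x3) so two vaddsh ops can issue independently. */
_Float16 add_chain(_Float16 x0, _Float16 x1, _Float16 x2, _Float16 x3) {
  return x0 + x1 + x2 + x3;
}

The reassociate_adds1 through reassociate_adds4 tests below check exactly that rebalanced shape for every way the inputs can be commuted.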
+ +define half @reassociate_adds1(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_adds1: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz half %x0, %x1 + %t1 = fadd reassoc nsz half %t0, %x2 + %t2 = fadd reassoc nsz half %t1, %x3 + ret half %t2 +} + +define half @reassociate_adds2(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_adds2: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz half %x0, %x1 + %t1 = fadd reassoc nsz half %x2, %t0 + %t2 = fadd reassoc nsz half %t1, %x3 + ret half %t2 +} + +define half @reassociate_adds3(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_adds3: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz half %x0, %x1 + %t1 = fadd reassoc nsz half %t0, %x2 + %t2 = fadd reassoc nsz half %x3, %t1 + ret half %t2 +} + +define half @reassociate_adds4(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_adds4: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz half %x0, %x1 + %t1 = fadd reassoc nsz half %x2, %t0 + %t2 = fadd reassoc nsz half %x3, %t1 + ret half %t2 +} + +; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not +; produced because that would cost more compile time. + +define half @reassociate_adds5(half %x0, half %x1, half %x2, half %x3, half %x4, half %x5, half %x6, half %x7) { +; CHECK-LABEL: reassociate_adds5: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm5, %xmm4, %xmm1 +; CHECK-NEXT: vaddsh %xmm6, %xmm1, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm7, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz half %x0, %x1 + %t1 = fadd reassoc nsz half %t0, %x2 + %t2 = fadd reassoc nsz half %t1, %x3 + %t3 = fadd reassoc nsz half %t2, %x4 + %t4 = fadd reassoc nsz half %t3, %x5 + %t5 = fadd reassoc nsz half %t4, %x6 + %t6 = fadd reassoc nsz half %t5, %x7 + ret half %t6 +} + +; Verify that we only need two associative operations to reassociate the operands. +; Also, we should reassociate such that the result of the high latency division +; is used by the final 'add' rather than reassociating the %x3 operand with the +; division. The latter reassociation would not improve anything. + +define half @reassociate_adds6(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_adds6: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fdiv reassoc nsz half %x0, %x1 + %t1 = fadd reassoc nsz half %x2, %t0 + %t2 = fadd reassoc nsz half %x3, %t1 + ret half %t2 +} + +; Verify that SSE and AVX scalar single-precision multiplies are reassociated. 
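The divide-based tests (reassociate_adds6 above, reassociate_muls1 next) pin down a scheduling detail: the slow vdivsh result should be consumed only by the final operation, so the add or multiply of the other two operands can overlap with the divide. A hedged sketch of the corresponding source shape, again assuming fast-math:

/* Mirrors reassociate_adds6: t0 = x0 / x1 is long-latency, and after
   reassociation the final add is the only user of the divide result. */
_Float16 div_then_adds(_Float16 x0, _Float16 x1, _Float16 x2, _Float16 x3) {
  return x3 + (x2 + x0 / x1);
}

/* The fmul variant checked in reassociate_muls1. */
_Float16 div_then_muls(_Float16 x0, _Float16 x1, _Float16 x2, _Float16 x3) {
  return x3 * (x2 * (x0 / x1));
}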
+ +define half @reassociate_muls1(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_muls1: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmulsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vmulsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fdiv reassoc nsz half %x0, %x1 + %t1 = fmul reassoc nsz half %x2, %t0 + %t2 = fmul reassoc nsz half %x3, %t1 + ret half %t2 +} + +; Verify that SSE and AVX 128-bit vector half-precision adds are reassociated. + +define <8 x half> @reassociate_adds_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) { +; CHECK-LABEL: reassociate_adds_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddph %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fdiv reassoc nsz <8 x half> %x0, %x1 + %t1 = fadd reassoc nsz <8 x half> %x2, %t0 + %t2 = fadd reassoc nsz <8 x half> %x3, %t1 + ret <8 x half> %t2 +} + +; Verify that SSE and AVX 128-bit vector half-precision multiplies are reassociated. + +define <8 x half> @reassociate_muls_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) { +; CHECK-LABEL: reassociate_muls_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmulph %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vmulph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz <8 x half> %x0, %x1 + %t1 = fmul reassoc nsz <8 x half> %x2, %t0 + %t2 = fmul reassoc nsz <8 x half> %x3, %t1 + ret <8 x half> %t2 +} + +; Verify that AVX 256-bit vector half-precision adds are reassociated. + +define <16 x half> @reassociate_adds_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) { +; CHECK-LABEL: reassociate_adds_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vaddph %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t0 = fdiv reassoc nsz <16 x half> %x0, %x1 + %t1 = fadd reassoc nsz <16 x half> %x2, %t0 + %t2 = fadd reassoc nsz <16 x half> %x3, %t1 + ret <16 x half> %t2 +} + +; Verify that AVX 256-bit vector half-precision multiplies are reassociated. + +define <16 x half> @reassociate_muls_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) { +; CHECK-LABEL: reassociate_muls_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmulph %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vmulph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz <16 x half> %x0, %x1 + %t1 = fmul reassoc nsz <16 x half> %x2, %t0 + %t2 = fmul reassoc nsz <16 x half> %x3, %t1 + ret <16 x half> %t2 +} + +; Verify that AVX512 512-bit vector half-precision adds are reassociated. + +define <32 x half> @reassociate_adds_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) { +; CHECK-LABEL: reassociate_adds_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddph %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t0 = fdiv reassoc nsz <32 x half> %x0, %x1 + %t1 = fadd reassoc nsz <32 x half> %x2, %t0 + %t2 = fadd reassoc nsz <32 x half> %x3, %t1 + ret <32 x half> %t2 +} + +; Verify that AVX512 512-bit vector half-precision multiplies are reassociated. 
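The packed 512-bit chain can also be reached through the __m512h intrinsics this series adds; the intrinsic names below are taken from my reading of the new headers, so treat them as an assumption rather than part of this patch. Built with -mavx512fp16 and -ffast-math (so the multiplies carry reassoc/nsz), the shape matches reassociate_muls_v32f16:

#include <immintrin.h>

/* Serial form in source: the add feeds the first multiply, which feeds
   the second. After reassociation the two multiplies no longer chain,
   giving the vaddph, vmulph, vmulph sequence checked below. */
__m512h mul_chain(__m512h x0, __m512h x1, __m512h x2, __m512h x3) {
  __m512h t0 = _mm512_add_ph(x0, x1);
  __m512h t1 = _mm512_mul_ph(x2, t0);
  return _mm512_mul_ph(x3, t1);
}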
+ +define <32 x half> @reassociate_muls_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) { +; CHECK-LABEL: reassociate_muls_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmulph %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz <32 x half> %x0, %x1 + %t1 = fmul reassoc nsz <32 x half> %x2, %t0 + %t2 = fmul reassoc nsz <32 x half> %x3, %t1 + ret <32 x half> %t2 +} + +; Verify that SSE and AVX scalar half-precision minimum ops are reassociated. + +define half @reassociate_mins_half(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_mins_half: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vminsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fdiv half %x0, %x1 + %cmp1 = fcmp olt half %x2, %t0 + %sel1 = select i1 %cmp1, half %x2, half %t0 + %cmp2 = fcmp olt half %x3, %sel1 + %sel2 = select i1 %cmp2, half %x3, half %sel1 + ret half %sel2 +} + +; Verify that SSE and AVX scalar half-precision maximum ops are reassociated. + +define half @reassociate_maxs_half(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_maxs_half: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmaxsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fdiv half %x0, %x1 + %cmp1 = fcmp ogt half %x2, %t0 + %sel1 = select i1 %cmp1, half %x2, half %t0 + %cmp2 = fcmp ogt half %x3, %sel1 + %sel2 = select i1 %cmp2, half %x3, half %sel1 + ret half %sel2 +} + +; Verify that SSE and AVX 128-bit vector half-precision minimum ops are reassociated. + +define <8 x half> @reassociate_mins_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) { +; CHECK-LABEL: reassociate_mins_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vminph %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd <8 x half> %x0, %x1 + %cmp1 = fcmp olt <8 x half> %x2, %t0 + %sel1 = select <8 x i1> %cmp1, <8 x half> %x2, <8 x half> %t0 + %cmp2 = fcmp olt <8 x half> %x3, %sel1 + %sel2 = select <8 x i1> %cmp2, <8 x half> %x3, <8 x half> %sel1 + ret <8 x half> %sel2 +} + +; Verify that SSE and AVX 128-bit vector half-precision maximum ops are reassociated. + +define <8 x half> @reassociate_maxs_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) { +; CHECK-LABEL: reassociate_maxs_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmaxph %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd <8 x half> %x0, %x1 + %cmp1 = fcmp ogt <8 x half> %x2, %t0 + %sel1 = select <8 x i1> %cmp1, <8 x half> %x2, <8 x half> %t0 + %cmp2 = fcmp ogt <8 x half> %x3, %sel1 + %sel2 = select <8 x i1> %cmp2, <8 x half> %x3, <8 x half> %sel1 + ret <8 x half> %sel2 +} + +; Verify that AVX 256-bit vector half-precision minimum ops are reassociated. 
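Unlike the arithmetic tests, the min and max tests are built from a compare feeding a select, which is the IR clang produces for the usual conditional expression rather than for an intrinsic call. A scalar sketch, illustrative only:

/* x < y ? x : y returns the second operand on unordered input, which is
   also the x86 min semantics, so it can be selected straight to vminsh;
   the vector tests below apply the same compare-plus-select per lane. */
_Float16 fmin_sh(_Float16 x, _Float16 y) {
  return x < y ? x : y;
}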
+ +define <16 x half> @reassociate_mins_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) { +; CHECK-LABEL: reassociate_mins_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vminph %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vminph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t0 = fadd <16 x half> %x0, %x1 + %cmp1 = fcmp olt <16 x half> %x2, %t0 + %sel1 = select <16 x i1> %cmp1, <16 x half> %x2, <16 x half> %t0 + %cmp2 = fcmp olt <16 x half> %x3, %sel1 + %sel2 = select <16 x i1> %cmp2, <16 x half> %x3, <16 x half> %sel1 + ret <16 x half> %sel2 +} + +; Verify that AVX 256-bit vector half-precision maximum ops are reassociated. + +define <16 x half> @reassociate_maxs_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) { +; CHECK-LABEL: reassociate_maxs_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmaxph %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vmaxph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t0 = fadd <16 x half> %x0, %x1 + %cmp1 = fcmp ogt <16 x half> %x2, %t0 + %sel1 = select <16 x i1> %cmp1, <16 x half> %x2, <16 x half> %t0 + %cmp2 = fcmp ogt <16 x half> %x3, %sel1 + %sel2 = select <16 x i1> %cmp2, <16 x half> %x3, <16 x half> %sel1 + ret <16 x half> %sel2 +} + +; Verify that AVX512 512-bit vector half-precision minimum ops are reassociated. + +define <32 x half> @reassociate_mins_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) { +; CHECK-LABEL: reassociate_mins_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vminph %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vminph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t0 = fadd <32 x half> %x0, %x1 + %cmp1 = fcmp olt <32 x half> %x2, %t0 + %sel1 = select <32 x i1> %cmp1, <32 x half> %x2, <32 x half> %t0 + %cmp2 = fcmp olt <32 x half> %x3, %sel1 + %sel2 = select <32 x i1> %cmp2, <32 x half> %x3, <32 x half> %sel1 + ret <32 x half> %sel2 +} + +; Verify that AVX512 512-bit vector half-precision maximum ops are reassociated. 
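One caveat before the 512-bit max test: regrouping these compare/select chains is only legal because the RUN lines pass -enable-no-nans-fp-math and -enable-no-signed-zeros-fp-math; with IEEE NaN propagation the idiom is not associative. A hedged scalar sketch of the max form:

/* a > b ? a : b lowers to fcmp ogt plus select, i.e. vmaxsh or vmaxph.
   If b could be NaN, max(max(a, b), c) and max(a, max(b, c)) may differ,
   so the no-NaNs flag is required before the combiner may rebalance. */
_Float16 fmax3(_Float16 a, _Float16 b, _Float16 c) {
  _Float16 t = a > b ? a : b;
  return t > c ? t : c;
}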
+ +define <32 x half> @reassociate_maxs_v16f32(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) { +; CHECK-LABEL: reassociate_maxs_v16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmaxph %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vmaxph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t0 = fadd <32 x half> %x0, %x1 + %cmp1 = fcmp ogt <32 x half> %x2, %t0 + %sel1 = select <32 x i1> %cmp1, <32 x half> %x2, <32 x half> %t0 + %cmp2 = fcmp ogt <32 x half> %x3, %sel1 + %sel2 = select <32 x i1> %cmp2, <32 x half> %x3, <32 x half> %sel1 + ret <32 x half> %sel2 +} + diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll --- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -1217,6 +1217,19 @@ ret <8 x half> %res } +define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: movsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11] +; CHECK-NEXT: vmovsh %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddph %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> + %res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> + %res = fadd <8 x half> %res1, %res2 + ret <8 x half> %res +} + define i16 @test_movw(half %x) { ; X64-LABEL: test_movw: ; X64: # %bb.0: @@ -1885,3 +1898,31 @@ %18 = fmul contract <4 x float> %17, ret <4 x float> %18 } + +; Make sure load/stores of v4f16 are handled well on 32-bit targets where +; default widening legalization can't use i64. +define void @load_store_v4f16(<4 x half>* %x, <4 x half>* %y, <4 x half>* %z) { +; X64-LABEL: load_store_v4f16: +; X64: # %bb.0: +; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; X64-NEXT: vmovlps %xmm0, (%rdx) +; X64-NEXT: retq +; +; X86-LABEL: load_store_v4f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; X86-NEXT: vmovlps %xmm0, (%eax) +; X86-NEXT: retl + %a = load <4 x half>, <4 x half>* %x + %b = load <4 x half>, <4 x half>* %y + %c = fadd <4 x half> %a, %b + store <4 x half> %c, <4 x half>* %z + ret void +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll b/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll @@ -0,0 +1,141 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=+avx512fp16 | FileCheck %s --check-prefix=CHECK_UNSAFE +; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512fp16 | FileCheck %s --check-prefix=CHECK + +define <32 x half> @test_max_v32f16(<32 x half> * %a_ptr, <32 x half> %b) { +; CHECK_UNSAFE-LABEL: test_max_v32f16: +; CHECK_UNSAFE: # %bb.0: +; CHECK_UNSAFE-NEXT: vmaxph (%rdi), %zmm0, %zmm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_max_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %a = load <32 x half>, <32 x half>* %a_ptr + %tmp = fcmp fast ogt <32 x half> %a, %b + %tmp4 = select <32 x i1> %tmp, <32 x half> %a, <32 x half> %b + ret <32 x half> %tmp4; +} + 
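The CHECK_UNSAFE versus CHECK split in this file comes down to commutability: vmaxph and vminph encode the memory source as the second operand, so folding the load requires swapping the inputs, and IEEE NaN and signed-zero rules forbid that unless the no-NaNs/no-signed-zeros flags are given. A sketch of the scalar shape (it mirrors test_max_f16 further down; illustrative, and assumes fast-math so the compare carries the fast flag the IR here uses):

/* With the relaxed flags this collapses to vmaxsh (%rdi), %xmm0, %xmm0;
   without them the value is loaded with vmovsh first so that the
   original operand order is preserved. */
_Float16 max_from_mem(const _Float16 *p, _Float16 b) {
  _Float16 a = *p;
  return a > b ? a : b;
}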
+define <32 x half> @test_min_v32f16(<32 x half>* %a_ptr, <32 x half> %b) { +; CHECK_UNSAFE-LABEL: test_min_v32f16: +; CHECK_UNSAFE: # %bb.0: +; CHECK_UNSAFE-NEXT: vminph (%rdi), %zmm0, %zmm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_min_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %a = load <32 x half>, <32 x half>* %a_ptr + %tmp = fcmp fast olt <32 x half> %a, %b + %tmp4 = select <32 x i1> %tmp, <32 x half> %a, <32 x half> %b + ret <32 x half> %tmp4; +} + +define <16 x half> @test_max_v16f16(<16 x half> * %a_ptr, <16 x half> %b) { +; CHECK_UNSAFE-LABEL: test_max_v16f16: +; CHECK_UNSAFE: # %bb.0: +; CHECK_UNSAFE-NEXT: vmaxph (%rdi), %ymm0, %ymm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_max_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vmaxph %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %a = load <16 x half>, <16 x half>* %a_ptr + %tmp = fcmp fast ogt <16 x half> %a, %b + %tmp4 = select <16 x i1> %tmp, <16 x half> %a, <16 x half> %b + ret <16 x half> %tmp4; +} + +define <16 x half> @test_min_v16f16(<16 x half>* %a_ptr, <16 x half> %b) { +; CHECK_UNSAFE-LABEL: test_min_v16f16: +; CHECK_UNSAFE: # %bb.0: +; CHECK_UNSAFE-NEXT: vminph (%rdi), %ymm0, %ymm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_min_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vminph %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %a = load <16 x half>, <16 x half>* %a_ptr + %tmp = fcmp fast olt <16 x half> %a, %b + %tmp4 = select <16 x i1> %tmp, <16 x half> %a, <16 x half> %b + ret <16 x half> %tmp4; +} + +define <8 x half> @test_max_v8f16(<8 x half> * %a_ptr, <8 x half> %b) { +; CHECK_UNSAFE-LABEL: test_max_v8f16: +; CHECK_UNSAFE: # %bb.0: +; CHECK_UNSAFE-NEXT: vmaxph (%rdi), %xmm0, %xmm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_max_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %xmm1 +; CHECK-NEXT: vmaxph %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %a = load <8 x half>, <8 x half>* %a_ptr + %tmp = fcmp fast ogt <8 x half> %a, %b + %tmp4 = select <8 x i1> %tmp, <8 x half> %a, <8 x half> %b + ret <8 x half> %tmp4; +} + +define <8 x half> @test_min_v8f16(<8 x half>* %a_ptr, <8 x half> %b) { +; CHECK_UNSAFE-LABEL: test_min_v8f16: +; CHECK_UNSAFE: # %bb.0: +; CHECK_UNSAFE-NEXT: vminph (%rdi), %xmm0, %xmm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_min_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %xmm1 +; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %a = load <8 x half>, <8 x half>* %a_ptr + %tmp = fcmp fast olt <8 x half> %a, %b + %tmp4 = select <8 x i1> %tmp, <8 x half> %a, <8 x half> %b + ret <8 x half> %tmp4; +} + +define half @test_max_f16(half %a, half* %ptr) { +; CHECK_UNSAFE-LABEL: test_max_f16: +; CHECK_UNSAFE: # %bb.0: # %entry +; CHECK_UNSAFE-NEXT: vmaxsh (%rdi), %xmm0, %xmm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_max_f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovsh (%rdi), %xmm1 +; CHECK-NEXT: vmaxsh %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = load half, half* %ptr + %1 = fcmp fast ogt half %0, %a + %2 = select i1 %1, half %0, half %a + ret half %2 +} + +define half @test_min_f16(half %a, half* %ptr) { +; CHECK_UNSAFE-LABEL: test_min_f16: +; CHECK_UNSAFE: # %bb.0: # %entry +; CHECK_UNSAFE-NEXT: vminsh (%rdi), %xmm0, %xmm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_min_f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovsh (%rdi), %xmm1 +; CHECK-NEXT: vminsh 
%xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = load half, half* %ptr + %1 = fcmp fast olt half %0, %a + %2 = select i1 %1, half %0, half %a + ret half %2 +} diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll @@ -0,0 +1,719 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK-64 + +define i32 @test_f16_oeq_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_oeq_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnel %eax, %ecx +; CHECK-32-NEXT: cmovpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_oeq_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnel %esi, %eax +; CHECK-64-NEXT: cmovpl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ogt_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ogt_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmoval %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ogt_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovbel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_oge_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_oge_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovael %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_oge_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovbl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_olt_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_olt_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmoval %eax, 
%ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_olt_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmovbel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ole_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ole_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovael %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ole_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmovbl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_one_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_one_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_one_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ord_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ord_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ord_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovpl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ueq_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ueq_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ueq_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata 
!"ueq", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ugt_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ugt_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ugt_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmovael %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_uge_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_uge_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_uge_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmoval %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ult_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ult_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ult_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovael %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ule_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ule_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ule_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmoval %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_une_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_une_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal 
{{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnel %eax, %ecx +; CHECK-32-NEXT: cmovpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_une_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %esi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnel %edi, %eax +; CHECK-64-NEXT: cmovpl %edi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_uno_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_uno_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_uno_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnpl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_oeq_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_oeq_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnel %eax, %ecx +; CHECK-32-NEXT: cmovpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_oeq_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnel %esi, %eax +; CHECK-64-NEXT: cmovpl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ogt_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ogt_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmoval %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ogt_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovbel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_oge_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_oge_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovael %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_oge_s: +; 
CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovbl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_olt_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_olt_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmoval %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_olt_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmovbel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ole_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ole_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovael %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ole_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmovbl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_one_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_one_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_one_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ord_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ord_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ord_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovpl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 
@test_f16_ueq_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ueq_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ueq_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ugt_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ugt_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ugt_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmovael %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_uge_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_uge_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_uge_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmoval %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ult_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ult_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ult_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovael %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ule_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ule_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbel %eax, %ecx +; 
CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ule_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmoval %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_une_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_une_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnel %eax, %ecx +; CHECK-32-NEXT: cmovpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_une_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %esi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnel %edi, %eax +; CHECK-64-NEXT: cmovpl %edi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_uno_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_uno_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_uno_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnpl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define void @foo(half %0, half %1) #0 { +; CHECK-32-LABEL: foo: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: jbe .LBB28_1 +; CHECK-32-NEXT: # %bb.2: +; CHECK-32-NEXT: jmp bar@PLT # TAILCALL +; CHECK-32-NEXT: .LBB28_1: +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: foo: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: jbe .LBB28_1 +; CHECK-64-NEXT: # %bb.2: +; CHECK-64-NEXT: jmp bar@PLT # TAILCALL +; CHECK-64-NEXT: .LBB28_1: +; CHECK-64-NEXT: retq + %3 = call i1 @llvm.experimental.constrained.fcmp.f16( half %0, half %1, metadata !"ogt", metadata !"fpexcept.strict") #0 + br i1 %3, label %4, label %5 + +4: ; preds = %2 + tail call void @bar() #0 + br label %5 + +5: ; preds = %4, %2 + ret void +} +declare void @bar() + +attributes #0 = { strictfp } + +declare i1 @llvm.experimental.constrained.fcmp.f16(half, half, metadata, metadata) +declare i1 @llvm.experimental.constrained.fcmps.f16(half, half, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | 
FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64 + +declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata) +declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata) +declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata) +declare half @llvm.experimental.constrained.fdiv.f16(half, half, metadata, metadata) + +define half @fadd_f16(half %a, half %b) nounwind strictfp { +; X86-LABEL: fadd_f16: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vaddsh {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: fadd_f16: +; X64: # %bb.0: +; X64-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq + %ret = call half @llvm.experimental.constrained.fadd.f16(half %a, half %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %ret +} + +define half @fsub_f16(half %a, half %b) nounwind strictfp { +; X86-LABEL: fsub_f16: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vsubsh {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: fsub_f16: +; X64: # %bb.0: +; X64-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq + %ret = call half @llvm.experimental.constrained.fsub.f16(half %a, half %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %ret +} + +define half @fmul_f16(half %a, half %b) nounwind strictfp { +; X86-LABEL: fmul_f16: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmulsh {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: fmul_f16: +; X64: # %bb.0: +; X64-NEXT: vmulsh %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq + %ret = call half @llvm.experimental.constrained.fmul.f16(half %a, half %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %ret +} + +define half @fdiv_f16(half %a, half %b) nounwind strictfp { +; X86-LABEL: fdiv_f16: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vdivsh {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: fdiv_f16: +; X64: # %bb.0: +; X64-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq + %ret = call half @llvm.experimental.constrained.fdiv.f16(half %a, half %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll b/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll --- a/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll +++ b/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll @@ -1,5 +1,68 @@ ; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+avx512fp16 -mattr=+avx512vl -o - | FileCheck %s +; This test checks that only a single je gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +; CHECK-LABEL: foo1: +; CHECK: je +; CHECK-NOT: je +define <8 x half> @foo1(i32 %v1, <8 x half> %v2, <8 x half> %v3, <8 x half> %v4) nounwind { +entry: + %cmp = icmp eq i32 %v1, 0 + %t1 = select i1 %cmp, <8 x half> %v2, <8 x half> %v3 + %t2 = select i1 %cmp, <8 x half> %v3, <8 x half> %v4 + %sub = fsub <8 x half> %t1, %t2 + ret <8 x half> %sub +} + +; This test checks that only a single ja gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. This combines +; all the supported types together into one long string of selects based +; on the same condition. 
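
A rough C analogue of foo2, illustrative only: the function name, variable names, and the -mavx512fp16 -mavx512vl compile flags below are assumptions for the sketch, not taken from the patch. The point is the same as in the IR test — several FP16-typed selects all keyed on one comparison, which the backend should lower behind a single conditional branch rather than one branch per select.

```c
#include <immintrin.h>

/* Several FP16-typed selects sharing one condition; each select should map
 * onto a CMOV pseudo, and lowering is expected to emit a single branch. */
void select_chain(unsigned v1, _Float16 s2, _Float16 s3,
                  __m128h v2, __m128h v3,
                  _Float16 *dst_s, __m128h *dst_v) {
  int pick_first = v1 > 31;      /* one shared condition            */
  *dst_s = pick_first ? s2 : s3; /* scalar half select              */
  *dst_v = pick_first ? v2 : v3; /* <8 x half> vector select        */
}
```
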
+; CHECK-LABEL: foo2: +; CHECK: ja +; CHECK-NOT: ja +define void @foo2(i32 %v1, + half %v32, half %v33, + <8 x half> %v52, <8 x half> %v53, + <16 x half> %v122, <16 x half> %v123, + <32 x half> %v132, <32 x half> %v133, + i8 * %dst) nounwind { +entry: + %add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 2 + %a31 = bitcast i8* %add.ptr31 to half* + + %add.ptr51 = getelementptr inbounds i8, i8* %dst, i32 4 + %a51 = bitcast i8* %add.ptr51 to <8 x half>* + + %add.ptr121 = getelementptr inbounds i8, i8* %dst, i32 20 + %a121 = bitcast i8* %add.ptr121 to <16 x half>* + + %add.ptr131 = getelementptr inbounds i8, i8* %dst, i32 52 + %a131 = bitcast i8* %add.ptr131 to <32 x half>* + + ; These operations are necessary, because select of two single use loads + ; ends up getting optimized into a select of two leas, followed by a + ; single load of the selected address. + + %t33 = fadd half %v33, %v32 + %t53 = fadd <8 x half> %v53, %v52 + %t123 = fadd <16 x half> %v123, %v122 + %t133 = fadd <32 x half> %v133, %v132 + + %cmp = icmp ugt i32 %v1, 31 + %t31 = select i1 %cmp, half %v32, half %t33 + %t51 = select i1 %cmp, <8 x half> %v52, <8 x half> %t53 + %t121 = select i1 %cmp, <16 x half> %v122, <16 x half> %t123 + %t131 = select i1 %cmp, <32 x half> %v132, <32 x half> %t133 + + store half %t31, half* %a31, align 2 + store <8 x half> %t51, <8 x half>* %a51, align 16 + store <16 x half> %t121, <16 x half>* %a121, align 32 + store <32 x half> %t131, <32 x half>* %a131, align 64 + + ret void +} + ; This test checks that only a single jne gets generated in the final code ; for lowering the CMOV pseudos that get created for this IR. define dso_local <32 x half> @foo3(<32 x half> %a, <32 x half> %b, i1 zeroext %sign) local_unnamed_addr #0 { diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll @@ -0,0 +1,572 @@ +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; Stack reload folding tests. +; +; By including a nop call with sideeffects we can force a partial register spill of the +; relevant registers and check that the reload is correctly folded into the instruction. 
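
A minimal C sketch of that spill-and-fold idea, assuming -mavx512fp16 and an invented function name; the tests themselves do this from IR with a "nop" asm that clobbers xmm2 through xmm31, which (per the comment above) forces a partial spill of the relevant registers.

```c
#include <immintrin.h>

/* Clobbering the registers that currently hold a and b pushes the compiler
 * to spill and reload them around the asm; the hoped-for codegen is that the
 * reload of one operand folds into vaddph's memory operand, which is exactly
 * what the FileCheck patterns below look for. */
__m512h spill_then_add(__m512h a, __m512h b) {
  __asm__ volatile("nop" ::: "xmm0", "xmm1", "xmm2", "xmm3",
                             "xmm4", "xmm5", "xmm6", "xmm7");
  return _mm512_add_ph(a, b); /* e.g. vaddph <stack slot>, %zmm, %zmm */
}
```
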
+ +define <32 x half> @stack_fold_addph_zmm(<32 x half> %a0, <32 x half> %a1) { + ;CHECK-LABEL: stack_fold_addph_zmm + ;CHECK: vaddph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd <32 x half> %a0, %a1 + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_addph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) { + ;CHECK-LABEL: stack_fold_addph_zmm_k: + ;CHECK: vaddph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd <32 x half> %a0, %a1 + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_addph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) { + ;CHECK-LABEL: stack_fold_addph_zmm_k_commuted: + ;CHECK: vaddph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd <32 x half> %a1, %a0 + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_addph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) { + ;CHECK-LABEL: stack_fold_addph_zmm_kz + ;CHECK: vaddph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd <32 x half> %a1, %a0 + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define half @stack_fold_addsh(half %a0, half %a1) { + ;CHECK-LABEL: stack_fold_addsh + ;CHECK: vaddsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd half %a0, %a1 + 
ret half %2 +} + +define <8 x half> @stack_fold_addsh_int(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_addsh_int + ;CHECK: vaddsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = extractelement <8 x half> %a0, i32 0 + %3 = extractelement <8 x half> %a1, i32 0 + %4 = fadd half %2, %3 + %5 = insertelement <8 x half> %a0, half %4, i32 0 + ret <8 x half> %5 +} + +define i32 @stack_fold_cmpph(<32 x half> %a0, <32 x half> %a1) { + ;CHECK-LABEL: stack_fold_cmpph + ;CHECK: vcmpeqph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %res = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %a0, <32 x half> %a1, i32 0, <32 x i1> , i32 4) + %2 = bitcast <32 x i1> %res to i32 + ret i32 %2 +} +declare <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half>, <32 x half>, i32, <32 x i1>, i32) + +define <32 x half> @stack_fold_cmpph_mask(<32 x half> %a0, <32 x half> %a1, <32 x half>* %a2, i32 %mask, <32 x half> %b0, <32 x half> %b1) { + ;CHECK-LABEL: stack_fold_cmpph_mask: + ;CHECK: vcmpeqph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load + %2 = load <32 x half>, <32 x half>* %a2 + %3 = fadd <32 x half> %a1, %2 + %4 = bitcast i32 %mask to <32 x i1> + %5 = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %3, <32 x half> %a0, i32 0, <32 x i1> , i32 4) + %6 = and <32 x i1> %4, %5 + %7 = select <32 x i1> %6, <32 x half> %b0, <32 x half> %b1 + ret <32 x half> %7 +} + +define <32 x half> @stack_fold_cmpph_mask_commuted(<32 x half> %a0, <32 x half> %a1, <32 x half>* %a2, i32 %mask, <32 x half> %b0, <32 x half> %b1) { + ;CHECK-LABEL: stack_fold_cmpph_mask_commuted: + ;CHECK: vcmpeqph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load + %2 = load <32 x half>, <32 x half>* %a2 + %3 = fadd <32 x half> %a1, %2 + %4 = bitcast i32 %mask to <32 x i1> + %5 = call <32 x i1> 
@llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %a0, <32 x half> %3, i32 0, <32 x i1> , i32 4) + %6 = and <32 x i1> %4, %5 + %7 = select <32 x i1> %6, <32 x half> %b0, <32 x half> %b1 + ret <32 x half> %7 +} + +define half @stack_fold_divsh(half %a0, half %a1) { + ;CHECK-LABEL: stack_fold_divsh + ;CHECK: vdivsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fdiv half %a0, %a1 + ret half %2 +} + +define <8 x half> @stack_fold_divsh_int(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_divsh_int + ;CHECK: vdivsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = extractelement <8 x half> %a0, i32 0 + %3 = extractelement <8 x half> %a1, i32 0 + %4 = fdiv half %2, %3 + %5 = insertelement <8 x half> %a0, half %4, i32 0 + ret <8 x half> %5 +} + +define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_maxph_zmm: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + ret <32 x half> %2 +} +declare <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone + +define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commuted: + ;CHECK-NOT: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #0 { + ;CHECK-LABEL: stack_fold_maxph_zmm_k: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #0 { + ;CHECK-LABEL: stack_fold_maxph_zmm_k_commuted: + ;CHECK-NOT: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { + ;CHECK-LABEL: stack_fold_maxph_zmm_kz: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { + ;CHECK-LABEL: stack_fold_maxph_zmm_kz_commuted: + ;CHECK-NOT: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_maxph_zmm_commutable(<32 x half> %a0, <32 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commutable: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_maxph_zmm_commutable_commuted(<32 x half> %a0, <32 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_commuted: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_maxph_zmm_commutable_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #1 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_k: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_maxph_zmm_commutable_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #1 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_k_commuted: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_maxph_zmm_commutable_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = 
call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_maxph_zmm_commutable_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz_commuted: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define half @stack_fold_maxsh(half %a0, half %a1) #0 { + ;CHECK-LABEL: stack_fold_maxsh: + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp ogt half %a0, %a1 + %3 = select i1 %2, half %a0, half %a1 + ret half %3 +} + +define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 { + ;CHECK-LABEL: stack_fold_maxsh_commuted: + ;CHECK-NOT: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp ogt half %a1, %a0 + %3 = select i1 %2, half %a1, half %a0 + ret half %3 +} + +define half @stack_fold_maxsh_commutable(half %a0, half %a1) #1 { + ;CHECK-LABEL: stack_fold_maxsh_commutable: + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp ogt half %a0, %a1 + %3 = select i1 %2, half %a0, half %a1 + ret half %3 +} + +define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 { + ;CHECK-LABEL: stack_fold_maxsh_commutable_commuted: + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp 
ogt half %a1, %a0 + %3 = select i1 %2, half %a1, half %a0 + ret half %3 +} + +define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_maxsh_int: + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @stack_fold_maxsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mask, <8 x half>* %passthru) { + ;CHECK-LABEL: stack_fold_maxsh_mask: + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_maxsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %mask) { + ;CHECK-LABEL: stack_fold_maxsh_maskz: + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 4) + ret <8 x half> %2 +} + +define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_minph_zmm: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + ret <32 x half> %2 +} +declare <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone + +define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_minph_zmm_commuted: + ;CHECK-NOT: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #0 { + ;CHECK-LABEL: stack_fold_minph_zmm_k: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #0 { + ;CHECK-LABEL: stack_fold_minph_zmm_k_commuted: + ;CHECK-NOT: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { + ;CHECK-LABEL: stack_fold_minph_zmm_kz: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { + ;CHECK-LABEL: stack_fold_minph_zmm_kz_commuted: + ;CHECK-NOT: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_minph_zmm_commutable(<32 x half> %a0, <32 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_minph_zmm_commutable: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_minph_zmm_commutable_commuted(<32 x half> %a0, <32 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_minph_zmm_commutable_commuted: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_minph_zmm_commutable_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #1 { + ;CHECK-LABEL: stack_fold_minph_zmm_commutable_k: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_minph_zmm_commutable_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #1 { + ;CHECK-LABEL: stack_fold_minph_zmm_commutable_k_commuted: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x 
half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_minph_zmm_commutable_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 { + ;CHECK-LABEL: stack_fold_minph_zmm_commutable_kz: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_minph_zmm_commutable_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 { + ;CHECK-LABEL: stack_fold_minph_zmm_commutable_kz_commuted: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define half @stack_fold_minsh(half %a0, half %a1) #0 { + ;CHECK-LABEL: stack_fold_minsh: + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp olt half %a0, %a1 + %3 = select i1 %2, half %a0, half %a1 + ret half %3 +} + +define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 { + ;CHECK-LABEL: stack_fold_minsh_commuted: + ;CHECK-NOT: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp olt half %a1, %a0 + %3 = select i1 %2, half %a1, half %a0 + ret half %3 +} + +define half @stack_fold_minsh_commutable(half %a0, half %a1) #1 { + ;CHECK-LABEL: stack_fold_minsh_commutable: + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp olt half %a0, %a1 + %3 = select i1 %2, half %a0, half %a1 + ret half %3 +} + +define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 { + ;CHECK-LABEL: stack_fold_minsh_commutable_commuted: + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp olt half %a1, %a0 + %3 = select i1 %2, half %a1, half %a0 + ret half %3 +} + +define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_minsh_int: + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @stack_fold_minsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mask, <8 x half>* %passthru) { + ;CHECK-LABEL: stack_fold_minsh_mask: + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_minsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %mask) { + ;CHECK-LABEL: stack_fold_minsh_maskz: + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 4) + ret <8 x half> %2 +} + +define <32 x half> @stack_fold_mulph_zmm(<32 x half> %a0, <32 x half> %a1) { + ;CHECK-LABEL: stack_fold_mulph_zmm + ;CHECK: vmulph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, 
{{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul <32 x half> %a0, %a1 + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_mulph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) { + ;CHECK-LABEL: stack_fold_mulph_zmm_k: + ;CHECK: vmulph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul <32 x half> %a0, %a1 + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_mulph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) { + ;CHECK-LABEL: stack_fold_mulph_zmm_k_commuted: + ;CHECK: vmulph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul <32 x half> %a1, %a0 + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_mulph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) { + ;CHECK-LABEL: stack_fold_mulph_zmm_kz + ;CHECK: vmulph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul <32 x half> %a1, %a0 + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define half @stack_fold_mulsh(half %a0, half %a1) { + ;CHECK-LABEL: stack_fold_mulsh + ;CHECK-NOT: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul half %a0, %a1 + ret half %2 +} + +define <8 x half> @stack_fold_mulsh_int(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_mulsh_int + ;CHECK-NOT: vmulss {{-?[0-9]*}}(%rsp), 
{{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = extractelement <8 x half> %a0, i32 0 + %3 = extractelement <8 x half> %a1, i32 0 + %4 = fmul half %2, %3 + %5 = insertelement <8 x half> %a0, half %4, i32 0 + ret <8 x half> %5 +} + +define <32 x half> @stack_fold_subph_zmm(<32 x half> %a0, <32 x half> %a1) { + ;CHECK-LABEL: stack_fold_subph_zmm + ;CHECK: vsubph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fsub <32 x half> %a0, %a1 + ret <32 x half> %2 +} + +define half @stack_fold_subsh(half %a0, half %a1) { + ;CHECK-LABEL: stack_fold_subsh + ;CHECK: vsubsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fsub half %a0, %a1 + ret half %2 +} + +define <8 x half> @stack_fold_subsh_int(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_subsh_int + ;CHECK: vsubsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = extractelement <8 x half> %a0, i32 0 + %3 = extractelement <8 x half> %a1, i32 0 + %4 = fsub half %2, %3 + %5 = insertelement <8 x half> %a0, half %4, i32 0 + ret <8 x half> %5 +} + +attributes #0 = { "unsafe-fp-math"="false" } +attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll @@ -0,0 +1,148 @@ +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512fp16 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; Stack reload folding tests. +; +; By including a nop call with sideeffects we can force a partial register spill of the +; relevant registers and check that the reload is correctly folded into the instruction. 
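
For orientation, the 128/256-bit operations these tests fold look roughly like this at the C intrinsics level. The intrinsic names are assumed to follow the usual AVX512-FP16/VL convention and are not taken from the patch; compile with -mavx512fp16 -mavx512vl.

```c
#include <immintrin.h>

/* Plain arithmetic and mask-producing compare at YMM width; the tests below
 * check that the second source of each instruction can be reloaded directly
 * from a stack slot. */
__m256h add_ph_ymm(__m256h a, __m256h b) {
  return _mm256_add_ph(a, b);                  /* vaddph %ymm               */
}

__mmask16 cmpeq_ph_ymm(__m256h a, __m256h b) {
  return _mm256_cmp_ph_mask(a, b, _CMP_EQ_OQ); /* vcmpeqph into a k-register */
}
```
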
+ +define <8 x half> @stack_fold_addph(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_addph + ;CHECK: vaddph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd <8 x half> %a0, %a1 + ret <8 x half> %2 +} + +define <16 x half> @stack_fold_addph_ymm(<16 x half> %a0, <16 x half> %a1) { + ;CHECK-LABEL: stack_fold_addph_ymm + ;CHECK: vaddph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd <16 x half> %a0, %a1 + ret <16 x half> %2 +} + +define i8 @stack_fold_cmpph(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_cmpph + ;CHECK: vcmpeqph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %res = call <8 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.128(<8 x half> %a0, <8 x half> %a1, i32 0, <8 x i1> ) + %2 = bitcast <8 x i1> %res to i8 + ret i8 %2 +} +declare <8 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.128(<8 x half>, <8 x half>, i32, <8 x i1>) + +define i16 @stack_fold_cmpph_ymm(<16 x half> %a0, <16 x half> %a1) { + ;CHECK-LABEL: stack_fold_cmpph_ymm + ;CHECK: vcmpeqph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %res = call <16 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.256(<16 x half> %a0, <16 x half> %a1, i32 0, <16 x i1> ) + %2 = bitcast <16 x i1> %res to i16 + ret i16 %2 +} +declare <16 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.256(<16 x half>, <16 x half>, i32, <16 x i1>) + +define <8 x half> @stack_fold_divph(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_divph + ;CHECK: vdivph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fdiv <8 x half> %a0, %a1 + ret <8 x half> %2 +} + +define <16 x half> @stack_fold_divph_ymm(<16 x half> %a0, <16 x half> %a1) { + ;CHECK-LABEL: stack_fold_divph_ymm + ;CHECK: vdivph {{-?[0-9]*}}(%rsp), 
{{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fdiv <16 x half> %a0, %a1 + ret <16 x half> %2 +} + +define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_maxph + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> %a0, <8 x half> %a1) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half>, <8 x half>) nounwind readnone + +define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_maxph_commutable + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> %a0, <8 x half> %a1) + ret <8 x half> %2 +} + +define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_maxph_ymm + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> %a0, <16 x half> %a1) + ret <16 x half> %2 +} +declare <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half>, <16 x half>) nounwind readnone + +define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_maxph_ymm_commutable + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> %a0, <16 x half> %a1) + ret <16 x half> %2 +} + +define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_minph + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded 
Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> %a0, <8 x half> %a1) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half>, <8 x half>) nounwind readnone + +define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_minph_commutable + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> %a0, <8 x half> %a1) + ret <8 x half> %2 +} + +define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_minph_ymm + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %a0, <16 x half> %a1) + ret <16 x half> %2 +} +declare <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half>, <16 x half>) nounwind readnone + +define <16 x half> @stack_fold_minph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_minph_ymm_commutable + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %a0, <16 x half> %a1) + ret <16 x half> %2 +} + +define <8 x half> @stack_fold_mulph(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_mulph + ;CHECK: vmulph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul <8 x half> %a0, %a1 + ret <8 x half> %2 +} + +define <16 x half> @stack_fold_mulph_ymm(<16 x half> %a0, <16 x half> %a1) { + ;CHECK-LABEL: stack_fold_mulph_ymm + ;CHECK: vmulph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul <16 x half> %a0, %a1 + ret <16 x half> %2 +} + +attributes #0 = { "unsafe-fp-math"="false" } +attributes #1 = { "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK + +declare <8 x half> @llvm.experimental.constrained.fadd.v8f16(<8 x half>, <8 x half>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.fsub.v8f16(<8 x half>, <8 x half>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.fmul.v8f16(<8 x half>, <8 x half>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.fdiv.v8f16(<8 x half>, <8 x half>, metadata, metadata) + +define <8 x half> @f2(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x half> @llvm.experimental.constrained.fadd.v8f16(<8 x half> %a, <8 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %ret +} + +define <8 x half> @f4(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x half> @llvm.experimental.constrained.fsub.v8f16(<8 x half> %a, <8 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %ret +} + +define <8 x half> @f6(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x half> @llvm.experimental.constrained.fmul.v8f16(<8 x half> %a, <8 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %ret +} + +define <8 x half> @f8(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x half> @llvm.experimental.constrained.fdiv.v8f16(<8 x half> %a, <8 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s + +declare <16 x half> @llvm.experimental.constrained.fadd.v16f16(<16 x half>, <16 x half>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.fsub.v16f16(<16 x half>, <16 x half>, metadata, 
metadata) +declare <16 x half> @llvm.experimental.constrained.fmul.v16f16(<16 x half>, <16 x half>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.fdiv.v16f16(<16 x half>, <16 x half>, metadata, metadata) + +define <16 x half> @f2(<16 x half> %a, <16 x half> %b) #0 { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x half> @llvm.experimental.constrained.fadd.v16f16(<16 x half> %a, <16 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %ret +} + +define <16 x half> @f4(<16 x half> %a, <16 x half> %b) #0 { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x half> @llvm.experimental.constrained.fsub.v16f16(<16 x half> %a, <16 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %ret +} + +define <16 x half> @f6(<16 x half> %a, <16 x half> %b) #0 { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x half> @llvm.experimental.constrained.fmul.v16f16(<16 x half> %a, <16 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %ret +} + +define <16 x half> @f8(<16 x half> %a, <16 x half> %b) #0 { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x half> @llvm.experimental.constrained.fdiv.v16f16(<16 x half> %a, <16 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s + +declare <32 x half> @llvm.experimental.constrained.fadd.v32f16(<32 x half>, <32 x half>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.fsub.v32f16(<32 x half>, <32 x half>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.fmul.v32f16(<32 x half>, <32 x half>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.fdiv.v32f16(<32 x half>, <32 x half>, metadata, metadata) + +define <32 x half> @f2(<32 x half> %a, <32 x half> %b) #0 { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x half> @llvm.experimental.constrained.fadd.v32f16(<32 x half> %a, <32 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %ret +} + +define <32 x half> @f4(<32 x half> %a, <32 x half> %b) #0 { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x half> @llvm.experimental.constrained.fsub.v32f16(<32 x half> %a, <32 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %ret +} + +define <32 x half> @f6(<32 x half> %a, <32 x half> %b) #0 { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call 
<32 x half> @llvm.experimental.constrained.fmul.v32f16(<32 x half> %a, <32 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %ret +} + +define <32 x half> @f8(<32 x half> %a, <32 x half> %b) #0 { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x half> @llvm.experimental.constrained.fdiv.v32f16(<32 x half> %a, <32 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll @@ -0,0 +1,1012 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64 + +define <8 x i16> @test_v8f16_oeq_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_oeq_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpeqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_oeq_q: +; X64: # %bb.0: +; X64-NEXT: vcmpeqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ogt_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ogt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpgt_oqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ogt_q: +; X64: # %bb.0: +; X64-NEXT: vcmplt_oqph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_oge_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_oge_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpge_oqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_oge_q: +; X64: # %bb.0: +; X64-NEXT: vcmple_oqph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> 
%b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_olt_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_olt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmplt_oqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_olt_q: +; X64: # %bb.0: +; X64-NEXT: vcmplt_oqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ole_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ole_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmple_oqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ole_q: +; X64: # %bb.0: +; X64-NEXT: vcmple_oqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_one_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_one_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpneq_oqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_one_q: +; X64: # %bb.0: +; X64-NEXT: vcmpneq_oqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ord_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ord_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpordph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ord_q: +; X64: # %bb.0: +; X64-NEXT: vcmpordph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ueq_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ueq_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; 
X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpeq_uqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ueq_q: +; X64: # %bb.0: +; X64-NEXT: vcmpeq_uqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ugt_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ugt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpnle_uqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ugt_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnle_uqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_uge_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_uge_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpnlt_uqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_uge_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnlt_uqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ult_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ult_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpnge_uqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ult_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnle_uqph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ule_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ule_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpngt_uqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ule_q: +; X64: # %bb.0: +; X64-NEXT: 
vcmpnlt_uqph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_une_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_une_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpneqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_une_q: +; X64: # %bb.0: +; X64-NEXT: vcmpneqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_uno_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_uno_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpunordph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_uno_q: +; X64: # %bb.0: +; X64-NEXT: vcmpunordph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_oeq_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_oeq_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpeq_osph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_oeq_s: +; X64: # %bb.0: +; X64-NEXT: vcmpeq_osph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ogt_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ogt_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpgtph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ogt_s: +; X64: # %bb.0: +; X64-NEXT: vcmpltph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <8 x 
i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_oge_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_oge_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpgeph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_oge_s: +; X64: # %bb.0: +; X64-NEXT: vcmpleph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_olt_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_olt_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpltph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_olt_s: +; X64: # %bb.0: +; X64-NEXT: vcmpltph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ole_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ole_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpleph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ole_s: +; X64: # %bb.0: +; X64-NEXT: vcmpleph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_one_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_one_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpneq_osph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_one_s: +; X64: # %bb.0: +; X64-NEXT: vcmpneq_osph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ord_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ord_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; 
X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpord_sph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ord_s: +; X64: # %bb.0: +; X64-NEXT: vcmpord_sph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ueq_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ueq_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpeq_usph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ueq_s: +; X64: # %bb.0: +; X64-NEXT: vcmpeq_usph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ugt_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ugt_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpnleph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ugt_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnleph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_uge_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_uge_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpnltph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_uge_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnltph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ult_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ult_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpngeph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ult_s: +; X64: # 
%bb.0: +; X64-NEXT: vcmpnleph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ule_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ule_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpngtph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ule_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnltph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_une_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_une_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpneq_usph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_une_s: +; X64: # %bb.0: +; X64-NEXT: vcmpneq_usph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_uno_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_uno_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpunord_sph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_uno_s: +; X64: # %bb.0: +; X64-NEXT: vcmpunord_sph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <2 x i16> @test_v2f16_oeq_q(<2 x i16> %a, <2 x i16> %b, <2 x half> %f1, <2 x half> %f2) #0 { +; X86-LABEL: test_v2f16_oeq_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movb $-3, %al +; X86-NEXT: kmovd %eax, %k0 +; X86-NEXT: vucomish 8(%ebp), %xmm2 +; X86-NEXT: setnp %al +; X86-NEXT: sete %cl +; X86-NEXT: testb %al, %cl +; X86-NEXT: setne %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k0, %k1, %k0 +; X86-NEXT: vpsrld $16, %xmm2, %xmm2 +; X86-NEXT: vucomish 10(%ebp), %xmm2 +; X86-NEXT: setnp %al +; X86-NEXT: sete %cl +; X86-NEXT: testb %al, %cl +; X86-NEXT: setne %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: 
kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $6, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v2f16_oeq_q: +; X64: # %bb.0: +; X64-NEXT: vucomish %xmm3, %xmm2 +; X64-NEXT: setnp %al +; X64-NEXT: sete %cl +; X64-NEXT: testb %al, %cl +; X64-NEXT: setne %al +; X64-NEXT: kmovd %eax, %k0 +; X64-NEXT: movb $-3, %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k1, %k0, %k0 +; X64-NEXT: vpsrld $16, %xmm3, %xmm3 +; X64-NEXT: vpsrld $16, %xmm2, %xmm2 +; X64-NEXT: vucomish %xmm3, %xmm2 +; X64-NEXT: setnp %al +; X64-NEXT: sete %cl +; X64-NEXT: testb %al, %cl +; X64-NEXT: setne %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $6, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f16( + <2 x half> %f1, <2 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <2 x i1> %cond, <2 x i16> %a, <2 x i16> %b + ret <2 x i16> %res +} + +define <2 x i16> @test_v2f16_ogt_q(<2 x i16> %a, <2 x i16> %b, <2 x half> %f1, <2 x half> %f2) #0 { +; X86-LABEL: test_v2f16_ogt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movb $-3, %al +; X86-NEXT: kmovd %eax, %k0 +; X86-NEXT: vcomish 8(%ebp), %xmm2 +; X86-NEXT: seta %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k0, %k1, %k0 +; X86-NEXT: vpsrld $16, %xmm2, %xmm2 +; X86-NEXT: vcomish 10(%ebp), %xmm2 +; X86-NEXT: seta %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $6, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v2f16_ogt_q: +; X64: # %bb.0: +; X64-NEXT: movb $-3, %al +; X64-NEXT: kmovd %eax, %k0 +; X64-NEXT: vcomish %xmm3, %xmm2 +; X64-NEXT: seta %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k0, %k1, %k0 +; X64-NEXT: vpsrld $16, %xmm3, %xmm3 +; X64-NEXT: vpsrld $16, %xmm2, %xmm2 +; X64-NEXT: vcomish %xmm3, %xmm2 +; X64-NEXT: seta %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $6, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f16( + <2 x half> %f1, <2 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <2 x i1> %cond, <2 x i16> %a, <2 x i16> %b + ret <2 x i16> %res +} + +define <4 x i16> @test_v4f16_oge_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, <4 x half> %f2) #0 { +; X86-LABEL: test_v4f16_oge_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movb $-3, %al +; X86-NEXT: kmovd %eax, %k0 +; X86-NEXT: vucomish 8(%ebp), %xmm2 +; X86-NEXT: setae %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k0, %k1, %k0 +; X86-NEXT: vpsrld $16, %xmm2, %xmm3 +; X86-NEXT: vucomish 10(%ebp), %xmm3 +; X86-NEXT: setae %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $6, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k0 +; X86-NEXT: movb $-5, %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k1, %k0, %k0 +; X86-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; X86-NEXT: 
vucomish 12(%ebp), %xmm3 +; X86-NEXT: setae %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $5, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k0 +; X86-NEXT: movb $-9, %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k1, %k0, %k0 +; X86-NEXT: vpsrlq $48, %xmm2, %xmm2 +; X86-NEXT: vucomish 14(%ebp), %xmm2 +; X86-NEXT: setae %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $4, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v4f16_oge_q: +; X64: # %bb.0: +; X64-NEXT: movb $-3, %al +; X64-NEXT: kmovd %eax, %k0 +; X64-NEXT: vucomish %xmm3, %xmm2 +; X64-NEXT: setae %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k0, %k1, %k0 +; X64-NEXT: vpsrld $16, %xmm3, %xmm4 +; X64-NEXT: vpsrld $16, %xmm2, %xmm5 +; X64-NEXT: vucomish %xmm4, %xmm5 +; X64-NEXT: setae %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $6, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k0 +; X64-NEXT: movb $-5, %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k1, %k0, %k0 +; X64-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; X64-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; X64-NEXT: vucomish %xmm4, %xmm5 +; X64-NEXT: setae %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $5, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k0 +; X64-NEXT: movb $-9, %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k1, %k0, %k0 +; X64-NEXT: vpsrlq $48, %xmm3, %xmm3 +; X64-NEXT: vpsrlq $48, %xmm2, %xmm2 +; X64-NEXT: vucomish %xmm3, %xmm2 +; X64-NEXT: setae %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $4, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f16( + <4 x half> %f1, <4 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <4 x i1> %cond, <4 x i16> %a, <4 x i16> %b + ret <4 x i16> %res +} + +define <4 x i16> @test_v4f16_olt_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, <4 x half> %f2) #0 { +; X86-LABEL: test_v4f16_olt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movb $-3, %al +; X86-NEXT: kmovd %eax, %k0 +; X86-NEXT: vmovsh 8(%ebp), %xmm3 +; X86-NEXT: vcomish %xmm2, %xmm3 +; X86-NEXT: seta %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k0, %k1, %k0 +; X86-NEXT: vpsrld $16, %xmm2, %xmm3 +; X86-NEXT: vmovsh 10(%ebp), %xmm4 +; X86-NEXT: vcomish %xmm3, %xmm4 +; X86-NEXT: seta %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $6, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k0 +; X86-NEXT: movb $-5, %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k1, %k0, %k0 +; X86-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; X86-NEXT: vmovsh 12(%ebp), %xmm4 +; X86-NEXT: vcomish %xmm3, %xmm4 +; X86-NEXT: seta %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $5, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k0 +; X86-NEXT: movb $-9, %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k1, %k0, %k0 +; X86-NEXT: vpsrlq $48, %xmm2, %xmm2 +; X86-NEXT: vmovsh 14(%ebp), %xmm3 +; X86-NEXT: vcomish %xmm2, %xmm3 +; X86-NEXT: seta %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb 
$4, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v4f16_olt_q: +; X64: # %bb.0: +; X64-NEXT: movb $-3, %al +; X64-NEXT: kmovd %eax, %k0 +; X64-NEXT: vcomish %xmm2, %xmm3 +; X64-NEXT: seta %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k0, %k1, %k0 +; X64-NEXT: vpsrld $16, %xmm2, %xmm4 +; X64-NEXT: vpsrld $16, %xmm3, %xmm5 +; X64-NEXT: vcomish %xmm4, %xmm5 +; X64-NEXT: seta %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $6, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k0 +; X64-NEXT: movb $-5, %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k1, %k0, %k0 +; X64-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] +; X64-NEXT: vmovshdup {{.*#+}} xmm5 = xmm3[1,1,3,3] +; X64-NEXT: vcomish %xmm4, %xmm5 +; X64-NEXT: seta %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $5, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k0 +; X64-NEXT: movb $-9, %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k1, %k0, %k0 +; X64-NEXT: vpsrlq $48, %xmm2, %xmm2 +; X64-NEXT: vpsrlq $48, %xmm3, %xmm3 +; X64-NEXT: vcomish %xmm2, %xmm3 +; X64-NEXT: seta %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $4, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f16( + <4 x half> %f1, <4 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <4 x i1> %cond, <4 x i16> %a, <4 x i16> %b + ret <4 x i16> %res +} + +attributes #0 = { strictfp nounwind } + +declare <2 x i1> @llvm.experimental.constrained.fcmp.v2f16(<2 x half>, <2 x half>, metadata, metadata) +declare <2 x i1> @llvm.experimental.constrained.fcmps.v2f16(<2 x half>, <2 x half>, metadata, metadata) +declare <4 x i1> @llvm.experimental.constrained.fcmp.v4f16(<4 x half>, <4 x half>, metadata, metadata) +declare <4 x i1> @llvm.experimental.constrained.fcmps.v4f16(<4 x half>, <4 x half>, metadata, metadata) +declare <8 x i1> @llvm.experimental.constrained.fcmp.v8f16(<8 x half>, <8 x half>, metadata, metadata) +declare <8 x i1> @llvm.experimental.constrained.fcmps.v8f16(<8 x half>, <8 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-256-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-256-fp16.ll @@ -0,0 +1,708 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX512-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX512-64 + +define <16 x i16> @test_v16f16_oeq_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_oeq_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpeqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_oeq_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpeqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: 
vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ogt_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ogt_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpgt_oqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ogt_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmplt_oqph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_oge_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_oge_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpge_oqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_oge_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmple_oqph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_olt_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_olt_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmplt_oqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_olt_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmplt_oqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ole_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ole_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmple_oqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp 
+; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ole_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmple_oqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_one_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_one_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpneq_oqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_one_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpneq_oqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ord_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ord_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpordph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ord_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpordph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ueq_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ueq_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpeq_uqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ueq_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpeq_uqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ugt_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ugt_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl 
$-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpnle_uqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ugt_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnle_uqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_uge_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_uge_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpnlt_uqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_uge_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnlt_uqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ult_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ult_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpnge_uqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ult_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnle_uqph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ule_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ule_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpngt_uqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ule_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnlt_uqph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_une_q(<16 x i16> %a, <16 x i16> 
%b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_une_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpneqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_une_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpneqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_uno_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_uno_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpunordph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_uno_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpunordph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_oeq_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_oeq_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpeq_osph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_oeq_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpeq_osph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ogt_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ogt_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpgtph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ogt_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpltph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ogt", + 
metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_oge_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_oge_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpgeph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_oge_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpleph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_olt_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_olt_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpltph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_olt_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpltph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ole_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ole_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpleph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ole_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpleph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_one_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_one_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpneq_osph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_one_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpneq_osph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw 
%ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ord_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ord_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpord_sph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ord_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpord_sph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ueq_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ueq_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpeq_usph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ueq_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpeq_usph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ugt_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ugt_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpnleph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ugt_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnleph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_uge_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_uge_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpnltph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; 
AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_uge_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnltph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ult_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ult_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpngeph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ult_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnleph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ule_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ule_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpngtph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ule_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnltph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_une_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_une_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpneq_usph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_une_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpneq_usph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_uno_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_uno_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, 
%esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpunord_sph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_uno_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpunord_sph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +attributes #0 = { strictfp nounwind } + +declare <16 x i1> @llvm.experimental.constrained.fcmp.v16f16(<16 x half>, <16 x half>, metadata, metadata) +declare <16 x i1> @llvm.experimental.constrained.fcmps.v16f16(<16 x half>, <16 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-512-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-512-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-512-fp16.ll @@ -0,0 +1,708 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X64 + +define <32 x i16> @test_v32f16_oeq_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_oeq_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpeqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_oeq_q: +; X64: # %bb.0: +; X64-NEXT: vcmpeqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ogt_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ogt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpgt_oqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ogt_q: +; X64: # %bb.0: +; X64-NEXT: vcmplt_oqph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_oge_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_oge_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpge_oqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl 
+; +; X64-LABEL: test_v32f16_oge_q: +; X64: # %bb.0: +; X64-NEXT: vcmple_oqph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_olt_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_olt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmplt_oqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_olt_q: +; X64: # %bb.0: +; X64-NEXT: vcmplt_oqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ole_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ole_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmple_oqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ole_q: +; X64: # %bb.0: +; X64-NEXT: vcmple_oqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_one_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_one_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpneq_oqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_one_q: +; X64: # %bb.0: +; X64-NEXT: vcmpneq_oqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ord_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ord_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpordph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ord_q: +; X64: # %bb.0: +; X64-NEXT: vcmpordph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> 
@llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ueq_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ueq_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpeq_uqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ueq_q: +; X64: # %bb.0: +; X64-NEXT: vcmpeq_uqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ugt_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ugt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpnle_uqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ugt_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnle_uqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_uge_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_uge_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpnlt_uqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_uge_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnlt_uqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ult_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ult_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpnge_uqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ult_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnle_uqph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b 
+ ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ule_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ule_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpngt_uqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ule_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnlt_uqph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_une_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_une_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpneqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_une_q: +; X64: # %bb.0: +; X64-NEXT: vcmpneqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_uno_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_uno_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpunordph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_uno_q: +; X64: # %bb.0: +; X64-NEXT: vcmpunordph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_oeq_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_oeq_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpeq_osph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_oeq_s: +; X64: # %bb.0: +; X64-NEXT: vcmpeq_osph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ogt_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ogt_s: +; X86: # %bb.0: +; X86-NEXT: 
pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpgtph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ogt_s: +; X64: # %bb.0: +; X64-NEXT: vcmpltph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_oge_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_oge_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpgeph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_oge_s: +; X64: # %bb.0: +; X64-NEXT: vcmpleph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_olt_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_olt_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpltph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_olt_s: +; X64: # %bb.0: +; X64-NEXT: vcmpltph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ole_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ole_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpleph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ole_s: +; X64: # %bb.0: +; X64-NEXT: vcmpleph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_one_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_one_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpneq_osph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, 
%esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_one_s: +; X64: # %bb.0: +; X64-NEXT: vcmpneq_osph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ord_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ord_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpord_sph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ord_s: +; X64: # %bb.0: +; X64-NEXT: vcmpord_sph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ueq_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ueq_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpeq_usph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ueq_s: +; X64: # %bb.0: +; X64-NEXT: vcmpeq_usph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ugt_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ugt_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpnleph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ugt_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnleph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_uge_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_uge_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpnltph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_uge_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnltph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; 
X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ult_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ult_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpngeph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ult_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnleph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ule_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ule_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpngtph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ule_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnltph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_une_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_une_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpneq_usph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_une_s: +; X64: # %bb.0: +; X64-NEXT: vcmpneq_usph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_uno_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_uno_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpunord_sph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_uno_s: +; X64: # %bb.0: +; X64-NEXT: vcmpunord_sph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select <32 x 
i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +attributes #0 = { strictfp nounwind } + +declare <32 x i1> @llvm.experimental.constrained.fcmp.v32f16(<32 x half>, <32 x half>, metadata, metadata) +declare <32 x i1> @llvm.experimental.constrained.fcmps.v32f16(<32 x half>, <32 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -3,8 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512FP16 ; ; vXf32 @@ -416,21 +417,29 @@ ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v2f16: -; AVX512: # %bb.0: -; AVX512-NEXT: movzwl %si, %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: movzwl %di, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm0, %xmm1 -; AVX512-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: cmoval %edi, %esi -; AVX512-NEXT: movw %si, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v2f16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movzwl %si, %eax +; AVX512BW-NEXT: vmovd %eax, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: movzwl %di, %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vucomiss %xmm0, %xmm1 +; AVX512BW-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; AVX512BW-NEXT: cmoval %edi, %esi +; AVX512BW-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; AVX512BW-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: retq +; +; AVX512FP16-LABEL: test_v2f16: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512FP16-NEXT: vcmpltph %xmm0, %xmm1, %k1 +; AVX512FP16-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1} +; AVX512FP16-NEXT: vmovaps %xmm1, %xmm0 +; AVX512FP16-NEXT: retq %1 = call nnan half @llvm.vector.reduce.fmax.v2f16(<2 x half> %a0) ret half %1 } diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -3,8 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512FP16 ; ; vXf32 @@ -415,21 +416,29 @@ ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v2f16: -; AVX512: # %bb.0: -; AVX512-NEXT: movzwl %si, %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: movzwl %di, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm0, %xmm1 -; AVX512-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: cmovbl %edi, %esi -; AVX512-NEXT: movw %si, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v2f16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movzwl %si, %eax +; AVX512BW-NEXT: vmovd %eax, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: movzwl %di, %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vucomiss %xmm0, %xmm1 +; AVX512BW-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; AVX512BW-NEXT: cmovbl %edi, %esi +; AVX512BW-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; AVX512BW-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: retq +; +; AVX512FP16-LABEL: test_v2f16: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512FP16-NEXT: vcmpltph %xmm1, %xmm0, %k1 +; AVX512FP16-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1} +; AVX512FP16-NEXT: vmovaps %xmm1, %xmm0 +; AVX512FP16-NEXT: retq %1 = call nnan half @llvm.vector.reduce.fmin.v2f16(<2 x half> %a0) ret half %1 } diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16.txt b/llvm/test/MC/Disassembler/X86/avx512fp16.txt --- a/llvm/test/MC/Disassembler/X86/avx512fp16.txt +++ b/llvm/test/MC/Disassembler/X86/avx512fp16.txt @@ -76,3 +76,387 @@ # ATT: vmovw %xmm30, -256(%rdx) # INTEL: vmovw word ptr [rdx - 256], xmm30 0x62,0x65,0x7d,0x08,0x7e,0x72,0x80 + +# ATT: vaddph %zmm28, %zmm29, %zmm30 +# INTEL: vaddph zmm30, zmm29, zmm28 +0x62,0x05,0x14,0x40,0x58,0xf4 + +# ATT: vaddph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vaddph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x05,0x14,0x10,0x58,0xf4 + +# ATT: vaddph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vaddph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x14,0x47,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vaddph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vaddph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x45,0x14,0x50,0x58,0x31 + +# ATT: vaddph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vaddph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x65,0x14,0x40,0x58,0x71,0x7f + +# ATT: vaddph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vaddph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x65,0x14,0xd7,0x58,0x72,0x80 + +# ATT: vaddsh %xmm28, %xmm29, %xmm30 +# INTEL: vaddsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x58,0xf4 + +# ATT: vaddsh 
{rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vaddsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x05,0x16,0x10,0x58,0xf4 + +# ATT: vaddsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vaddsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vaddsh (%r9), %xmm29, %xmm30 +# INTEL: vaddsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x58,0x31 + +# ATT: vaddsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vaddsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x58,0x71,0x7f + +# ATT: vaddsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vaddsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x58,0x72,0x80 + +# ATT: vcmpeqph %zmm28, %zmm29, %k5 +# INTEL: vcmpph k5, zmm29, zmm28, 0 +0x62,0x93,0x14,0x40,0xc2,0xec,0x00 + +# ATT: vcmpleph {sae}, %zmm28, %zmm29, %k5 +# INTEL: vcmpph k5, zmm29, zmm28, {sae}, 2 +0x62,0x93,0x14,0x10,0xc2,0xec,0x02 + +# ATT: vcmpneqph 268435456(%rbp,%r14,8), %zmm29, %k5 {%k7} +# INTEL: vcmpph k5 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456], 4 +0x62,0xb3,0x14,0x47,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x04 + +# ATT: vcmpnleph (%r9){1to32}, %zmm29, %k5 +# INTEL: vcmpph k5, zmm29, word ptr [r9]{1to32}, 6 +0x62,0xd3,0x14,0x50,0xc2,0x29,0x06 + +# ATT: vcmpeq_uqph 8128(%rcx), %zmm29, %k5 +# INTEL: vcmpph k5, zmm29, zmmword ptr [rcx + 8128], 8 +0x62,0xf3,0x14,0x40,0xc2,0x69,0x7f,0x08 + +# ATT: vcmpngtph -256(%rdx){1to32}, %zmm29, %k5 {%k7} +# INTEL: vcmpph k5 {k7}, zmm29, word ptr [rdx - 256]{1to32}, 10 +0x62,0xf3,0x14,0x57,0xc2,0x6a,0x80,0x0a + +# ATT: vcmpneq_oqsh %xmm28, %xmm29, %k5 +# INTEL: vcmpsh k5, xmm29, xmm28, 12 +0x62,0x93,0x16,0x00,0xc2,0xec,0x0c + +# ATT: vcmpgtsh {sae}, %xmm28, %xmm29, %k5 +# INTEL: vcmpsh k5, xmm29, xmm28, {sae}, 14 +0x62,0x93,0x16,0x10,0xc2,0xec,0x0e + +# ATT: vcmpeq_ossh 268435456(%rbp,%r14,8), %xmm29, %k5 {%k7} +# INTEL: vcmpsh k5 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456], 16 +0x62,0xb3,0x16,0x07,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x10 + +# ATT: vcmple_oqsh (%r9), %xmm29, %k5 +# INTEL: vcmpsh k5, xmm29, word ptr [r9], 18 +0x62,0xd3,0x16,0x00,0xc2,0x29,0x12 + +# ATT: vcmpneq_ussh 254(%rcx), %xmm29, %k5 +# INTEL: vcmpsh k5, xmm29, word ptr [rcx + 254], 20 +0x62,0xf3,0x16,0x00,0xc2,0x69,0x7f,0x14 + +# ATT: vcmpnle_uqsh -256(%rdx), %xmm29, %k5 {%k7} +# INTEL: vcmpsh k5 {k7}, xmm29, word ptr [rdx - 256], 22 +0x62,0xf3,0x16,0x07,0xc2,0x6a,0x80,0x16 + +# ATT: vcomish %xmm29, %xmm30 +# INTEL: vcomish xmm30, xmm29 +0x62,0x05,0x7c,0x08,0x2f,0xf5 + +# ATT: vcomish {sae}, %xmm29, %xmm30 +# INTEL: vcomish xmm30, xmm29, {sae} +0x62,0x05,0x7c,0x18,0x2f,0xf5 + +# ATT: vcomish 268435456(%rbp,%r14,8), %xmm30 +# INTEL: vcomish xmm30, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7c,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcomish (%r9), %xmm30 +# INTEL: vcomish xmm30, word ptr [r9] +0x62,0x45,0x7c,0x08,0x2f,0x31 + +# ATT: vcomish 254(%rcx), %xmm30 +# INTEL: vcomish xmm30, word ptr [rcx + 254] +0x62,0x65,0x7c,0x08,0x2f,0x71,0x7f + +# ATT: vcomish -256(%rdx), %xmm30 +# INTEL: vcomish xmm30, word ptr [rdx - 256] +0x62,0x65,0x7c,0x08,0x2f,0x72,0x80 + +# ATT: vdivph %zmm28, %zmm29, %zmm30 +# INTEL: vdivph zmm30, zmm29, zmm28 +0x62,0x05,0x14,0x40,0x5e,0xf4 + +# ATT: vdivph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vdivph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x05,0x14,0x10,0x5e,0xf4 + +# ATT: vdivph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vdivph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] 
+0x62,0x25,0x14,0x47,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdivph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vdivph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x45,0x14,0x50,0x5e,0x31 + +# ATT: vdivph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vdivph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x65,0x14,0x40,0x5e,0x71,0x7f + +# ATT: vdivph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vdivph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x65,0x14,0xd7,0x5e,0x72,0x80 + +# ATT: vdivsh %xmm28, %xmm29, %xmm30 +# INTEL: vdivsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x5e,0xf4 + +# ATT: vdivsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vdivsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x05,0x16,0x10,0x5e,0xf4 + +# ATT: vdivsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vdivsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdivsh (%r9), %xmm29, %xmm30 +# INTEL: vdivsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x5e,0x31 + +# ATT: vdivsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vdivsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x5e,0x71,0x7f + +# ATT: vdivsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vdivsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x5e,0x72,0x80 + +# ATT: vmaxph %zmm28, %zmm29, %zmm30 +# INTEL: vmaxph zmm30, zmm29, zmm28 +0x62,0x05,0x14,0x40,0x5f,0xf4 + +# ATT: vmaxph {sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vmaxph zmm30, zmm29, zmm28, {sae} +0x62,0x05,0x14,0x10,0x5f,0xf4 + +# ATT: vmaxph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vmaxph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x14,0x47,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmaxph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vmaxph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x45,0x14,0x50,0x5f,0x31 + +# ATT: vmaxph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vmaxph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x65,0x14,0x40,0x5f,0x71,0x7f + +# ATT: vmaxph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vmaxph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x65,0x14,0xd7,0x5f,0x72,0x80 + +# ATT: vmaxsh %xmm28, %xmm29, %xmm30 +# INTEL: vmaxsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x5f,0xf4 + +# ATT: vmaxsh {sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vmaxsh xmm30, xmm29, xmm28, {sae} +0x62,0x05,0x16,0x10,0x5f,0xf4 + +# ATT: vmaxsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vmaxsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmaxsh (%r9), %xmm29, %xmm30 +# INTEL: vmaxsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x5f,0x31 + +# ATT: vmaxsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vmaxsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x5f,0x71,0x7f + +# ATT: vmaxsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vmaxsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x5f,0x72,0x80 + +# ATT: vminph %zmm28, %zmm29, %zmm30 +# INTEL: vminph zmm30, zmm29, zmm28 +0x62,0x05,0x14,0x40,0x5d,0xf4 + +# ATT: vminph {sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vminph zmm30, zmm29, zmm28, {sae} +0x62,0x05,0x14,0x10,0x5d,0xf4 + +# ATT: vminph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vminph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x14,0x47,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vminph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vminph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x45,0x14,0x50,0x5d,0x31 + +# ATT: vminph 
8128(%rcx), %zmm29, %zmm30 +# INTEL: vminph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x65,0x14,0x40,0x5d,0x71,0x7f + +# ATT: vminph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vminph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x65,0x14,0xd7,0x5d,0x72,0x80 + +# ATT: vminsh %xmm28, %xmm29, %xmm30 +# INTEL: vminsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x5d,0xf4 + +# ATT: vminsh {sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vminsh xmm30, xmm29, xmm28, {sae} +0x62,0x05,0x16,0x10,0x5d,0xf4 + +# ATT: vminsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vminsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vminsh (%r9), %xmm29, %xmm30 +# INTEL: vminsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x5d,0x31 + +# ATT: vminsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vminsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x5d,0x71,0x7f + +# ATT: vminsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vminsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x5d,0x72,0x80 + +# ATT: vmulph %zmm28, %zmm29, %zmm30 +# INTEL: vmulph zmm30, zmm29, zmm28 +0x62,0x05,0x14,0x40,0x59,0xf4 + +# ATT: vmulph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vmulph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x05,0x14,0x10,0x59,0xf4 + +# ATT: vmulph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vmulph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x14,0x47,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmulph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vmulph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x45,0x14,0x50,0x59,0x31 + +# ATT: vmulph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vmulph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x65,0x14,0x40,0x59,0x71,0x7f + +# ATT: vmulph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vmulph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x65,0x14,0xd7,0x59,0x72,0x80 + +# ATT: vmulsh %xmm28, %xmm29, %xmm30 +# INTEL: vmulsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x59,0xf4 + +# ATT: vmulsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vmulsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x05,0x16,0x10,0x59,0xf4 + +# ATT: vmulsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vmulsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmulsh (%r9), %xmm29, %xmm30 +# INTEL: vmulsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x59,0x31 + +# ATT: vmulsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vmulsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x59,0x71,0x7f + +# ATT: vmulsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vmulsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x59,0x72,0x80 + +# ATT: vsubph %zmm28, %zmm29, %zmm30 +# INTEL: vsubph zmm30, zmm29, zmm28 +0x62,0x05,0x14,0x40,0x5c,0xf4 + +# ATT: vsubph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vsubph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x05,0x14,0x10,0x5c,0xf4 + +# ATT: vsubph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vsubph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x14,0x47,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsubph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vsubph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x45,0x14,0x50,0x5c,0x31 + +# ATT: vsubph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vsubph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x65,0x14,0x40,0x5c,0x71,0x7f + +# ATT: vsubph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vsubph 
zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x65,0x14,0xd7,0x5c,0x72,0x80 + +# ATT: vsubsh %xmm28, %xmm29, %xmm30 +# INTEL: vsubsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x5c,0xf4 + +# ATT: vsubsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vsubsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x05,0x16,0x10,0x5c,0xf4 + +# ATT: vsubsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vsubsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsubsh (%r9), %xmm29, %xmm30 +# INTEL: vsubsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x5c,0x31 + +# ATT: vsubsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vsubsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x5c,0x71,0x7f + +# ATT: vsubsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vsubsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x5c,0x72,0x80 + +# ATT: vucomish %xmm29, %xmm30 +# INTEL: vucomish xmm30, xmm29 +0x62,0x05,0x7c,0x08,0x2e,0xf5 + +# ATT: vucomish {sae}, %xmm29, %xmm30 +# INTEL: vucomish xmm30, xmm29, {sae} +0x62,0x05,0x7c,0x18,0x2e,0xf5 + +# ATT: vucomish 268435456(%rbp,%r14,8), %xmm30 +# INTEL: vucomish xmm30, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7c,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vucomish (%r9), %xmm30 +# INTEL: vucomish xmm30, word ptr [r9] +0x62,0x45,0x7c,0x08,0x2e,0x31 + +# ATT: vucomish 254(%rcx), %xmm30 +# INTEL: vucomish xmm30, word ptr [rcx + 254] +0x62,0x65,0x7c,0x08,0x2e,0x71,0x7f + +# ATT: vucomish -256(%rdx), %xmm30 +# INTEL: vucomish xmm30, word ptr [rdx - 256] +0x62,0x65,0x7c,0x08,0x2e,0x72,0x80 diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt @@ -0,0 +1,282 @@ +# RUN: llvm-mc --disassemble %s -triple=i686 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vaddph %ymm4, %ymm5, %ymm6 +# INTEL: vaddph ymm6, ymm5, ymm4 +0x62,0xf5,0x54,0x28,0x58,0xf4 + +# ATT: vaddph %xmm4, %xmm5, %xmm6 +# INTEL: vaddph xmm6, xmm5, xmm4 +0x62,0xf5,0x54,0x08,0x58,0xf4 + +# ATT: vaddph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vaddph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x2f,0x58,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vaddph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vaddph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf5,0x54,0x38,0x58,0x31 + +# ATT: vaddph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vaddph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf5,0x54,0x28,0x58,0x71,0x7f + +# ATT: vaddph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vaddph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf5,0x54,0xbf,0x58,0x72,0x80 + +# ATT: vaddph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vaddph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x0f,0x58,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vaddph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vaddph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf5,0x54,0x18,0x58,0x31 + +# ATT: vaddph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vaddph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf5,0x54,0x08,0x58,0x71,0x7f + +# ATT: vaddph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vaddph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf5,0x54,0x9f,0x58,0x72,0x80 + +# ATT: vcmpltph %ymm4, %ymm5, %k5 +# INTEL: vcmpph k5, ymm5, ymm4, 1 
+0x62,0xf3,0x54,0x28,0xc2,0xec,0x01
+
+# ATT: vcmpunordph %xmm4, %xmm5, %k5
+# INTEL: vcmpph k5, xmm5, xmm4, 3
+0x62,0xf3,0x54,0x08,0xc2,0xec,0x03
+
+# ATT: vcmpnltph 268435456(%esp,%esi,8), %xmm5, %k5 {%k7}
+# INTEL: vcmpph k5 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456], 5
+0x62,0xf3,0x54,0x0f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x05
+
+# ATT: vcmpordph (%ecx){1to8}, %xmm5, %k5
+# INTEL: vcmpph k5, xmm5, word ptr [ecx]{1to8}, 7
+0x62,0xf3,0x54,0x18,0xc2,0x29,0x07
+
+# ATT: vcmpngeph 2032(%ecx), %xmm5, %k5
+# INTEL: vcmpph k5, xmm5, xmmword ptr [ecx + 2032], 9
+0x62,0xf3,0x54,0x08,0xc2,0x69,0x7f,0x09
+
+# ATT: vcmpfalseph -256(%edx){1to8}, %xmm5, %k5 {%k7}
+# INTEL: vcmpph k5 {k7}, xmm5, word ptr [edx - 256]{1to8}, 11
+0x62,0xf3,0x54,0x1f,0xc2,0x6a,0x80,0x0b
+
+# ATT: vcmpgeph 268435456(%esp,%esi,8), %ymm5, %k5 {%k7}
+# INTEL: vcmpph k5 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456], 13
+0x62,0xf3,0x54,0x2f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x0d
+
+# ATT: vcmptrueph (%ecx){1to16}, %ymm5, %k5
+# INTEL: vcmpph k5, ymm5, word ptr [ecx]{1to16}, 15
+0x62,0xf3,0x54,0x38,0xc2,0x29,0x0f
+
+# ATT: vcmplt_oqph 4064(%ecx), %ymm5, %k5
+# INTEL: vcmpph k5, ymm5, ymmword ptr [ecx + 4064], 17
+0x62,0xf3,0x54,0x28,0xc2,0x69,0x7f,0x11
+
+# ATT: vcmpunord_sph -256(%edx){1to16}, %ymm5, %k5 {%k7}
+# INTEL: vcmpph k5 {k7}, ymm5, word ptr [edx - 256]{1to16}, 19
+0x62,0xf3,0x54,0x3f,0xc2,0x6a,0x80,0x13
+
+# ATT: vdivph %ymm4, %ymm5, %ymm6
+# INTEL: vdivph ymm6, ymm5, ymm4
+0x62,0xf5,0x54,0x28,0x5e,0xf4
+
+# ATT: vdivph %xmm4, %xmm5, %xmm6
+# INTEL: vdivph xmm6, xmm5, xmm4
+0x62,0xf5,0x54,0x08,0x5e,0xf4
+
+# ATT: vdivph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vdivph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf5,0x54,0x2f,0x5e,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vdivph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vdivph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf5,0x54,0x38,0x5e,0x31
+
+# ATT: vdivph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vdivph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf5,0x54,0x28,0x5e,0x71,0x7f
+
+# ATT: vdivph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vdivph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf5,0x54,0xbf,0x5e,0x72,0x80
+
+# ATT: vdivph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vdivph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf5,0x54,0x0f,0x5e,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vdivph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vdivph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf5,0x54,0x18,0x5e,0x31
+
+# ATT: vdivph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vdivph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf5,0x54,0x08,0x5e,0x71,0x7f
+
+# ATT: vdivph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vdivph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf5,0x54,0x9f,0x5e,0x72,0x80
+
+# ATT: vmaxph %ymm4, %ymm5, %ymm6
+# INTEL: vmaxph ymm6, ymm5, ymm4
+0x62,0xf5,0x54,0x28,0x5f,0xf4
+
+# ATT: vmaxph %xmm4, %xmm5, %xmm6
+# INTEL: vmaxph xmm6, xmm5, xmm4
+0x62,0xf5,0x54,0x08,0x5f,0xf4
+
+# ATT: vmaxph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vmaxph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf5,0x54,0x2f,0x5f,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vmaxph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vmaxph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf5,0x54,0x38,0x5f,0x31
+
+# ATT: vmaxph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vmaxph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf5,0x54,0x28,0x5f,0x71,0x7f
+
+# ATT: vmaxph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vmaxph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf5,0x54,0xbf,0x5f,0x72,0x80
+
+# ATT: vmaxph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vmaxph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf5,0x54,0x0f,0x5f,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vmaxph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vmaxph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf5,0x54,0x18,0x5f,0x31
+
+# ATT: vmaxph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vmaxph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf5,0x54,0x08,0x5f,0x71,0x7f
+
+# ATT: vmaxph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vmaxph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf5,0x54,0x9f,0x5f,0x72,0x80
+
+# ATT: vminph %ymm4, %ymm5, %ymm6
+# INTEL: vminph ymm6, ymm5, ymm4
+0x62,0xf5,0x54,0x28,0x5d,0xf4
+
+# ATT: vminph %xmm4, %xmm5, %xmm6
+# INTEL: vminph xmm6, xmm5, xmm4
+0x62,0xf5,0x54,0x08,0x5d,0xf4
+
+# ATT: vminph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vminph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf5,0x54,0x2f,0x5d,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vminph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vminph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf5,0x54,0x38,0x5d,0x31
+
+# ATT: vminph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vminph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf5,0x54,0x28,0x5d,0x71,0x7f
+
+# ATT: vminph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vminph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf5,0x54,0xbf,0x5d,0x72,0x80
+
+# ATT: vminph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vminph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf5,0x54,0x0f,0x5d,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vminph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vminph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf5,0x54,0x18,0x5d,0x31
+
+# ATT: vminph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vminph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf5,0x54,0x08,0x5d,0x71,0x7f
+
+# ATT: vminph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vminph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf5,0x54,0x9f,0x5d,0x72,0x80
+
+# ATT: vmulph %ymm4, %ymm5, %ymm6
+# INTEL: vmulph ymm6, ymm5, ymm4
+0x62,0xf5,0x54,0x28,0x59,0xf4
+
+# ATT: vmulph %xmm4, %xmm5, %xmm6
+# INTEL: vmulph xmm6, xmm5, xmm4
+0x62,0xf5,0x54,0x08,0x59,0xf4
+
+# ATT: vmulph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vmulph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf5,0x54,0x2f,0x59,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vmulph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vmulph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf5,0x54,0x38,0x59,0x31
+
+# ATT: vmulph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vmulph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf5,0x54,0x28,0x59,0x71,0x7f
+
+# ATT: vmulph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vmulph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf5,0x54,0xbf,0x59,0x72,0x80
+
+# ATT: vmulph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vmulph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf5,0x54,0x0f,0x59,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vmulph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vmulph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf5,0x54,0x18,0x59,0x31
+
+# ATT: vmulph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vmulph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf5,0x54,0x08,0x59,0x71,0x7f
+
+# ATT: vmulph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vmulph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf5,0x54,0x9f,0x59,0x72,0x80
+
+# ATT: vsubph %ymm4, %ymm5, %ymm6
+# INTEL: vsubph ymm6, ymm5, ymm4
+0x62,0xf5,0x54,0x28,0x5c,0xf4
+
+# ATT: vsubph %xmm4, %xmm5, %xmm6
+# INTEL: vsubph xmm6, xmm5, xmm4
+0x62,0xf5,0x54,0x08,0x5c,0xf4
+
+# ATT: vsubph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7}
+# INTEL: vsubph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf5,0x54,0x2f,0x5c,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vsubph (%ecx){1to16}, %ymm5, %ymm6
+# INTEL: vsubph ymm6, ymm5, word ptr [ecx]{1to16}
+0x62,0xf5,0x54,0x38,0x5c,0x31
+
+# ATT: vsubph 4064(%ecx), %ymm5, %ymm6
+# INTEL: vsubph ymm6, ymm5, ymmword ptr [ecx + 4064]
+0x62,0xf5,0x54,0x28,0x5c,0x71,0x7f
+
+# ATT: vsubph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z}
+# INTEL: vsubph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16}
+0x62,0xf5,0x54,0xbf,0x5c,0x72,0x80
+
+# ATT: vsubph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7}
+# INTEL: vsubph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf5,0x54,0x0f,0x5c,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vsubph (%ecx){1to8}, %xmm5, %xmm6
+# INTEL: vsubph xmm6, xmm5, word ptr [ecx]{1to8}
+0x62,0xf5,0x54,0x18,0x5c,0x31
+
+# ATT: vsubph 2032(%ecx), %xmm5, %xmm6
+# INTEL: vsubph xmm6, xmm5, xmmword ptr [ecx + 2032]
+0x62,0xf5,0x54,0x08,0x5c,0x71,0x7f
+
+# ATT: vsubph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z}
+# INTEL: vsubph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8}
+0x62,0xf5,0x54,0x9f,0x5c,0x72,0x80
diff --git a/llvm/test/MC/X86/avx512fp16.s b/llvm/test/MC/X86/avx512fp16.s
--- a/llvm/test/MC/X86/avx512fp16.s
+++ b/llvm/test/MC/X86/avx512fp16.s
@@ -75,3 +75,387 @@
 // CHECK: vmovw %xmm30, -256(%rdx)
 // CHECK: encoding: [0x62,0x65,0x7d,0x08,0x7e,0x72,0x80]
 vmovw %xmm30, -256(%rdx)
+
+// CHECK: vaddph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x05,0x14,0x40,0x58,0xf4]
+ vaddph %zmm28, %zmm29, %zmm30
+
+// CHECK: vaddph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x05,0x14,0x10,0x58,0xf4]
+ vaddph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vaddph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x25,0x14,0x47,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vaddph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vaddph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x45,0x14,0x50,0x58,0x31]
+ vaddph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vaddph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x65,0x14,0x40,0x58,0x71,0x7f]
+ vaddph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vaddph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x65,0x14,0xd7,0x58,0x72,0x80]
+ vaddph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vaddsh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x05,0x16,0x00,0x58,0xf4]
+ vaddsh %xmm28, %xmm29, %xmm30
+
+// CHECK: vaddsh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x05,0x16,0x10,0x58,0xf4]
+ vaddsh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vaddsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x25,0x16,0x07,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vaddsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vaddsh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x45,0x16,0x00,0x58,0x31]
+ vaddsh (%r9), %xmm29, %xmm30
+
+// CHECK: vaddsh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x65,0x16,0x00,0x58,0x71,0x7f]
+ vaddsh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vaddsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x65,0x16,0x87,0x58,0x72,0x80]
+ vaddsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vcmpneq_usph %zmm28, %zmm29, %k5
+// CHECK: encoding: [0x62,0x93,0x14,0x40,0xc2,0xec,0x14]
+ vcmpneq_usph %zmm28, %zmm29, %k5
+
+// CHECK: vcmpnlt_uqph {sae}, %zmm28, %zmm29, %k5
+// CHECK: encoding: [0x62,0x93,0x14,0x10,0xc2,0xec,0x15]
+ vcmpnlt_uqph {sae}, %zmm28, %zmm29, %k5
+
+// CHECK: vcmpnle_uqph 268435456(%rbp,%r14,8), %zmm29, %k5 {%k7}
+// CHECK: encoding: [0x62,0xb3,0x14,0x47,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x16]
+ vcmpnle_uqph 268435456(%rbp,%r14,8), %zmm29, %k5 {%k7}
+
+// CHECK: vcmpord_sph (%r9){1to32}, %zmm29, %k5
+// CHECK: encoding: [0x62,0xd3,0x14,0x50,0xc2,0x29,0x17]
+ vcmpord_sph (%r9){1to32}, %zmm29, %k5
+
+// CHECK: vcmpeq_usph 8128(%rcx), %zmm29, %k5
+// CHECK: encoding: [0x62,0xf3,0x14,0x40,0xc2,0x69,0x7f,0x18]
+ vcmpeq_usph 8128(%rcx), %zmm29, %k5
+
+// CHECK: vcmpnge_uqph -256(%rdx){1to32}, %zmm29, %k5 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x14,0x57,0xc2,0x6a,0x80,0x19]
+ vcmpnge_uqph -256(%rdx){1to32}, %zmm29, %k5 {%k7}
+
+// CHECK: vcmpngt_uqsh %xmm28, %xmm29, %k5
+// CHECK: encoding: [0x62,0x93,0x16,0x00,0xc2,0xec,0x1a]
+ vcmpngt_uqsh %xmm28, %xmm29, %k5
+
+// CHECK: vcmpfalse_ossh {sae}, %xmm28, %xmm29, %k5
+// CHECK: encoding: [0x62,0x93,0x16,0x10,0xc2,0xec,0x1b]
+ vcmpfalse_ossh {sae}, %xmm28, %xmm29, %k5
+
+// CHECK: vcmpneq_ossh 268435456(%rbp,%r14,8), %xmm29, %k5 {%k7}
+// CHECK: encoding: [0x62,0xb3,0x16,0x07,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x1c]
+ vcmpneq_ossh 268435456(%rbp,%r14,8), %xmm29, %k5 {%k7}
+
+// CHECK: vcmpge_oqsh (%r9), %xmm29, %k5
+// CHECK: encoding: [0x62,0xd3,0x16,0x00,0xc2,0x29,0x1d]
+ vcmpge_oqsh (%r9), %xmm29, %k5
+
+// CHECK: vcmpgt_oqsh 254(%rcx), %xmm29, %k5
+// CHECK: encoding: [0x62,0xf3,0x16,0x00,0xc2,0x69,0x7f,0x1e]
+ vcmpgt_oqsh 254(%rcx), %xmm29, %k5
+
+// CHECK: vcmptrue_ussh -256(%rdx), %xmm29, %k5 {%k7}
+// CHECK: encoding: [0x62,0xf3,0x16,0x07,0xc2,0x6a,0x80,0x1f]
+ vcmptrue_ussh -256(%rdx), %xmm29, %k5 {%k7}
+
+// CHECK: vcomish %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x05,0x7c,0x08,0x2f,0xf5]
+ vcomish %xmm29, %xmm30
+
+// CHECK: vcomish {sae}, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x05,0x7c,0x18,0x2f,0xf5]
+ vcomish {sae}, %xmm29, %xmm30
+
+// CHECK: vcomish 268435456(%rbp,%r14,8), %xmm30
+// CHECK: encoding: [0x62,0x25,0x7c,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vcomish 268435456(%rbp,%r14,8), %xmm30
+
+// CHECK: vcomish (%r9), %xmm30
+// CHECK: encoding: [0x62,0x45,0x7c,0x08,0x2f,0x31]
+ vcomish (%r9), %xmm30
+
+// CHECK: vcomish 254(%rcx), %xmm30
+// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x2f,0x71,0x7f]
+ vcomish 254(%rcx), %xmm30
+
+// CHECK: vcomish -256(%rdx), %xmm30
+// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x2f,0x72,0x80]
+ vcomish -256(%rdx), %xmm30
+
+// CHECK: vdivph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x05,0x14,0x40,0x5e,0xf4]
+ vdivph %zmm28, %zmm29, %zmm30
+
+// CHECK: vdivph {rn-sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x05,0x14,0x10,0x5e,0xf4]
+ vdivph {rn-sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vdivph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x25,0x14,0x47,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vdivph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vdivph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x45,0x14,0x50,0x5e,0x31]
+ vdivph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vdivph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x65,0x14,0x40,0x5e,0x71,0x7f]
+ vdivph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vdivph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x65,0x14,0xd7,0x5e,0x72,0x80]
+ vdivph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vdivsh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x05,0x16,0x00,0x5e,0xf4]
+ vdivsh %xmm28, %xmm29, %xmm30
+
+// CHECK: vdivsh {rn-sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x05,0x16,0x10,0x5e,0xf4]
+ vdivsh {rn-sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vdivsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x25,0x16,0x07,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vdivsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vdivsh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x45,0x16,0x00,0x5e,0x31]
+ vdivsh (%r9), %xmm29, %xmm30
+
+// CHECK: vdivsh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x65,0x16,0x00,0x5e,0x71,0x7f]
+ vdivsh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vdivsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x65,0x16,0x87,0x5e,0x72,0x80]
+ vdivsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vmaxph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x05,0x14,0x40,0x5f,0xf4]
+ vmaxph %zmm28, %zmm29, %zmm30
+
+// CHECK: vmaxph {sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x05,0x14,0x10,0x5f,0xf4]
+ vmaxph {sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vmaxph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x25,0x14,0x47,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vmaxph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vmaxph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x45,0x14,0x50,0x5f,0x31]
+ vmaxph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vmaxph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x65,0x14,0x40,0x5f,0x71,0x7f]
+ vmaxph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vmaxph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x65,0x14,0xd7,0x5f,0x72,0x80]
+ vmaxph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+
+// CHECK: vmaxsh %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x05,0x16,0x00,0x5f,0xf4]
+ vmaxsh %xmm28, %xmm29, %xmm30
+
+// CHECK: vmaxsh {sae}, %xmm28, %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x05,0x16,0x10,0x5f,0xf4]
+ vmaxsh {sae}, %xmm28, %xmm29, %xmm30
+
+// CHECK: vmaxsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+// CHECK: encoding: [0x62,0x25,0x16,0x07,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vmaxsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7}
+
+// CHECK: vmaxsh (%r9), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x45,0x16,0x00,0x5f,0x31]
+ vmaxsh (%r9), %xmm29, %xmm30
+
+// CHECK: vmaxsh 254(%rcx), %xmm29, %xmm30
+// CHECK: encoding: [0x62,0x65,0x16,0x00,0x5f,0x71,0x7f]
+ vmaxsh 254(%rcx), %xmm29, %xmm30
+
+// CHECK: vmaxsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x65,0x16,0x87,0x5f,0x72,0x80]
+ vmaxsh -256(%rdx), %xmm29, %xmm30 {%k7} {z}
+
+// CHECK: vminph %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x05,0x14,0x40,0x5d,0xf4]
+ vminph %zmm28, %zmm29, %zmm30
+
+// CHECK: vminph {sae}, %zmm28, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x05,0x14,0x10,0x5d,0xf4]
+ vminph {sae}, %zmm28, %zmm29, %zmm30
+
+// CHECK: vminph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+// CHECK: encoding: [0x62,0x25,0x14,0x47,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vminph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7}
+
+// CHECK: vminph (%r9){1to32}, %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x45,0x14,0x50,0x5d,0x31]
+ vminph (%r9){1to32}, %zmm29, %zmm30
+
+// CHECK: vminph 8128(%rcx), %zmm29, %zmm30
+// CHECK: encoding: [0x62,0x65,0x14,0x40,0x5d,0x71,0x7f]
+ vminph 8128(%rcx), %zmm29, %zmm30
+
+// CHECK: vminph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z}
+// CHECK: encoding: [0x62,0x65,0x14,0xd7,0x5d,0x72,0x80] + vminph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vminsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x00,0x5d,0xf4] + vminsh %xmm28, %xmm29, %xmm30 + +// CHECK: vminsh {sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x10,0x5d,0xf4] + vminsh {sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vminsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x16,0x07,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vminsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x16,0x00,0x5d,0x31] + vminsh (%r9), %xmm29, %xmm30 + +// CHECK: vminsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x5d,0x71,0x7f] + vminsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vminsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x16,0x87,0x5d,0x72,0x80] + vminsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vmulph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x40,0x59,0xf4] + vmulph %zmm28, %zmm29, %zmm30 + +// CHECK: vmulph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x10,0x59,0xf4] + vmulph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vmulph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x14,0x47,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vmulph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x45,0x14,0x50,0x59,0x31] + vmulph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vmulph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x65,0x14,0x40,0x59,0x71,0x7f] + vmulph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vmulph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x14,0xd7,0x59,0x72,0x80] + vmulph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vmulsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x00,0x59,0xf4] + vmulsh %xmm28, %xmm29, %xmm30 + +// CHECK: vmulsh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x10,0x59,0xf4] + vmulsh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vmulsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x16,0x07,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vmulsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x16,0x00,0x59,0x31] + vmulsh (%r9), %xmm29, %xmm30 + +// CHECK: vmulsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x59,0x71,0x7f] + vmulsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vmulsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x16,0x87,0x59,0x72,0x80] + vmulsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vsubph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x40,0x5c,0xf4] + vsubph %zmm28, %zmm29, %zmm30 + +// CHECK: vsubph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x10,0x5c,0xf4] + vsubph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vsubph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x14,0x47,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vsubph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x45,0x14,0x50,0x5c,0x31] + vsubph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vsubph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x65,0x14,0x40,0x5c,0x71,0x7f] + 
vsubph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vsubph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x14,0xd7,0x5c,0x72,0x80] + vsubph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vsubsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x00,0x5c,0xf4] + vsubsh %xmm28, %xmm29, %xmm30 + +// CHECK: vsubsh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x10,0x5c,0xf4] + vsubsh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vsubsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x16,0x07,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vsubsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x16,0x00,0x5c,0x31] + vsubsh (%r9), %xmm29, %xmm30 + +// CHECK: vsubsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x5c,0x71,0x7f] + vsubsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vsubsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x16,0x87,0x5c,0x72,0x80] + vsubsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vucomish %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x08,0x2e,0xf5] + vucomish %xmm29, %xmm30 + +// CHECK: vucomish {sae}, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x18,0x2e,0xf5] + vucomish {sae}, %xmm29, %xmm30 + +// CHECK: vucomish 268435456(%rbp,%r14,8), %xmm30 +// CHECK: encoding: [0x62,0x25,0x7c,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vucomish 268435456(%rbp,%r14,8), %xmm30 + +// CHECK: vucomish (%r9), %xmm30 +// CHECK: encoding: [0x62,0x45,0x7c,0x08,0x2e,0x31] + vucomish (%r9), %xmm30 + +// CHECK: vucomish 254(%rcx), %xmm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x2e,0x71,0x7f] + vucomish 254(%rcx), %xmm30 + +// CHECK: vucomish -256(%rdx), %xmm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x2e,0x72,0x80] + vucomish -256(%rdx), %xmm30 diff --git a/llvm/test/MC/X86/avx512fp16vl.s b/llvm/test/MC/X86/avx512fp16vl.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/avx512fp16vl.s @@ -0,0 +1,281 @@ +// RUN: llvm-mc -triple i686-unknown-unknown --show-encoding < %s | FileCheck %s + +// CHECK: vaddph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x58,0xf4] + vaddph %ymm4, %ymm5, %ymm6 + +// CHECK: vaddph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x58,0xf4] + vaddph %xmm4, %xmm5, %xmm6 + +// CHECK: vaddph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x2f,0x58,0xb4,0xf4,0x00,0x00,0x00,0x10] + vaddph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vaddph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x38,0x58,0x31] + vaddph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vaddph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x58,0x71,0x7f] + vaddph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vaddph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0xbf,0x58,0x72,0x80] + vaddph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vaddph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x0f,0x58,0xb4,0xf4,0x00,0x00,0x00,0x10] + vaddph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vaddph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x58,0x31] + vaddph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vaddph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x58,0x71,0x7f] + vaddph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vaddph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} 
{z} +// CHECK: encoding: [0x62,0xf5,0x54,0x9f,0x58,0x72,0x80] + vaddph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vcmpeqph %ymm4, %ymm5, %k5 +// CHECK: encoding: [0x62,0xf3,0x54,0x28,0xc2,0xec,0x00] + vcmpph $0, %ymm4, %ymm5, %k5 + +// CHECK: vcmpltph %xmm4, %xmm5, %k5 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0xc2,0xec,0x01] + vcmpph $1, %xmm4, %xmm5, %k5 + +// CHECK: vcmpleph 268435456(%esp,%esi,8), %xmm5, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x54,0x0f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x02] + vcmpph $2, 268435456(%esp,%esi,8), %xmm5, %k5 {%k7} + +// CHECK: vcmpunordph (%ecx){1to8}, %xmm5, %k5 +// CHECK: encoding: [0x62,0xf3,0x54,0x18,0xc2,0x29,0x03] + vcmpph $3, (%ecx){1to8}, %xmm5, %k5 + +// CHECK: vcmpneqph 2032(%ecx), %xmm5, %k5 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0xc2,0x69,0x7f,0x04] + vcmpph $4, 2032(%ecx), %xmm5, %k5 + +// CHECK: vcmpnltph -256(%edx){1to8}, %xmm5, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x54,0x1f,0xc2,0x6a,0x80,0x05] + vcmpph $5, -256(%edx){1to8}, %xmm5, %k5 {%k7} + +// CHECK: vcmpnleph 268435456(%esp,%esi,8), %ymm5, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x54,0x2f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x06] + vcmpph $6, 268435456(%esp,%esi,8), %ymm5, %k5 {%k7} + +// CHECK: vcmpordph (%ecx){1to16}, %ymm5, %k5 +// CHECK: encoding: [0x62,0xf3,0x54,0x38,0xc2,0x29,0x07] + vcmpph $7, (%ecx){1to16}, %ymm5, %k5 + +// CHECK: vcmpeq_uqph 4064(%ecx), %ymm5, %k5 +// CHECK: encoding: [0x62,0xf3,0x54,0x28,0xc2,0x69,0x7f,0x08] + vcmpph $8, 4064(%ecx), %ymm5, %k5 + +// CHECK: vcmpngeph -256(%edx){1to16}, %ymm5, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x54,0x3f,0xc2,0x6a,0x80,0x09] + vcmpph $9, -256(%edx){1to16}, %ymm5, %k5 {%k7} + +// CHECK: vdivph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5e,0xf4] + vdivph %ymm4, %ymm5, %ymm6 + +// CHECK: vdivph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5e,0xf4] + vdivph %xmm4, %xmm5, %xmm6 + +// CHECK: vdivph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x2f,0x5e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vdivph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vdivph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x38,0x5e,0x31] + vdivph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vdivph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5e,0x71,0x7f] + vdivph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vdivph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0xbf,0x5e,0x72,0x80] + vdivph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vdivph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x0f,0x5e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vdivph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vdivph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5e,0x31] + vdivph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vdivph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5e,0x71,0x7f] + vdivph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vdivph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0x9f,0x5e,0x72,0x80] + vdivph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vmaxph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5f,0xf4] + vmaxph %ymm4, %ymm5, %ymm6 + +// CHECK: vmaxph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5f,0xf4] + vmaxph %xmm4, %xmm5, %xmm6 + +// CHECK: vmaxph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: 
[0x62,0xf5,0x54,0x2f,0x5f,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmaxph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vmaxph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x38,0x5f,0x31] + vmaxph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vmaxph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5f,0x71,0x7f] + vmaxph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vmaxph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0xbf,0x5f,0x72,0x80] + vmaxph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vmaxph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x0f,0x5f,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmaxph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vmaxph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5f,0x31] + vmaxph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vmaxph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5f,0x71,0x7f] + vmaxph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vmaxph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0x9f,0x5f,0x72,0x80] + vmaxph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vminph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5d,0xf4] + vminph %ymm4, %ymm5, %ymm6 + +// CHECK: vminph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5d,0xf4] + vminph %xmm4, %xmm5, %xmm6 + +// CHECK: vminph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x2f,0x5d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vminph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vminph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x38,0x5d,0x31] + vminph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vminph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5d,0x71,0x7f] + vminph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vminph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0xbf,0x5d,0x72,0x80] + vminph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vminph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x0f,0x5d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vminph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vminph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5d,0x31] + vminph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vminph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5d,0x71,0x7f] + vminph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vminph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0x9f,0x5d,0x72,0x80] + vminph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vmulph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x59,0xf4] + vmulph %ymm4, %ymm5, %ymm6 + +// CHECK: vmulph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x59,0xf4] + vmulph %xmm4, %xmm5, %xmm6 + +// CHECK: vmulph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x2f,0x59,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmulph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vmulph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x38,0x59,0x31] + vmulph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vmulph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x59,0x71,0x7f] + vmulph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vmulph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0xbf,0x59,0x72,0x80] + vmulph 
-256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vmulph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x0f,0x59,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmulph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vmulph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x59,0x31] + vmulph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vmulph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x59,0x71,0x7f] + vmulph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vmulph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0x9f,0x59,0x72,0x80] + vmulph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vsubph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5c,0xf4] + vsubph %ymm4, %ymm5, %ymm6 + +// CHECK: vsubph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5c,0xf4] + vsubph %xmm4, %xmm5, %xmm6 + +// CHECK: vsubph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x2f,0x5c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vsubph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vsubph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x38,0x5c,0x31] + vsubph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vsubph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5c,0x71,0x7f] + vsubph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vsubph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0xbf,0x5c,0x72,0x80] + vsubph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vsubph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x0f,0x5c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vsubph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vsubph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5c,0x31] + vsubph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vsubph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5c,0x71,0x7f] + vsubph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vsubph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0x9f,0x5c,0x72,0x80] + vsubph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16.s b/llvm/test/MC/X86/intel-syntax-avx512fp16.s --- a/llvm/test/MC/X86/intel-syntax-avx512fp16.s +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16.s @@ -75,3 +75,387 @@ // CHECK: vmovw word ptr [edx - 256], xmm6 // CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x72,0x80] vmovw word ptr [edx - 256], xmm6 + +// CHECK: vaddph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x58,0xf4] + vaddph zmm6, zmm5, zmm4 + +// CHECK: vaddph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x58,0xf4] + vaddph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vaddph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x4f,0x58,0xb4,0xf4,0x00,0x00,0x00,0x10] + vaddph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vaddph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0x58,0x58,0x31] + vaddph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vaddph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x58,0x71,0x7f] + vaddph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vaddph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0xdf,0x58,0x72,0x80] + vaddph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vaddsh xmm6, xmm5, xmm4 +// 
CHECK: encoding: [0x62,0xf5,0x56,0x08,0x58,0xf4] + vaddsh xmm6, xmm5, xmm4 + +// CHECK: vaddsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x58,0xf4] + vaddsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vaddsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x58,0xb4,0xf4,0x00,0x00,0x00,0x10] + vaddsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vaddsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x58,0x31] + vaddsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vaddsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x58,0x71,0x7f] + vaddsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vaddsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x58,0x72,0x80] + vaddsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vcmpph k5, zmm5, zmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x48,0xc2,0xec,0x7b] + vcmpph k5, zmm5, zmm4, 123 + +// CHECK: vcmpph k5, zmm5, zmm4, {sae}, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x18,0xc2,0xec,0x7b] + vcmpph k5, zmm5, zmm4, {sae}, 123 + +// CHECK: vcmpph k5 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x4f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmpph k5 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vcmpph k5, zmm5, word ptr [ecx]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x58,0xc2,0x29,0x7b] + vcmpph k5, zmm5, word ptr [ecx]{1to32}, 123 + +// CHECK: vcmpph k5, zmm5, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x48,0xc2,0x69,0x7f,0x7b] + vcmpph k5, zmm5, zmmword ptr [ecx + 8128], 123 + +// CHECK: vcmpph k5 {k7}, zmm5, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x5f,0xc2,0x6a,0x80,0x7b] + vcmpph k5 {k7}, zmm5, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vcmpsh k5, xmm5, xmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x56,0x08,0xc2,0xec,0x7b] + vcmpsh k5, xmm5, xmm4, 123 + +// CHECK: vcmpsh k5, xmm5, xmm4, {sae}, 123 +// CHECK: encoding: [0x62,0xf3,0x56,0x18,0xc2,0xec,0x7b] + vcmpsh k5, xmm5, xmm4, {sae}, 123 + +// CHECK: vcmpsh k5 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x56,0x0f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmpsh k5 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vcmpsh k5, xmm5, word ptr [ecx], 123 +// CHECK: encoding: [0x62,0xf3,0x56,0x08,0xc2,0x29,0x7b] + vcmpsh k5, xmm5, word ptr [ecx], 123 + +// CHECK: vcmpsh k5, xmm5, word ptr [ecx + 254], 123 +// CHECK: encoding: [0x62,0xf3,0x56,0x08,0xc2,0x69,0x7f,0x7b] + vcmpsh k5, xmm5, word ptr [ecx + 254], 123 + +// CHECK: vcmpsh k5 {k7}, xmm5, word ptr [edx - 256], 123 +// CHECK: encoding: [0x62,0xf3,0x56,0x0f,0xc2,0x6a,0x80,0x7b] + vcmpsh k5 {k7}, xmm5, word ptr [edx - 256], 123 + +// CHECK: vcomish xmm6, xmm5 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2f,0xf5] + vcomish xmm6, xmm5 + +// CHECK: vcomish xmm6, xmm5, {sae} +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x2f,0xf5] + vcomish xmm6, xmm5, {sae} + +// CHECK: vcomish xmm6, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2f,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcomish xmm6, word ptr [esp + 8*esi + 268435456] + +// CHECK: vcomish xmm6, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2f,0x31] + vcomish xmm6, word ptr [ecx] + +// CHECK: vcomish xmm6, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2f,0x71,0x7f] + vcomish xmm6, word ptr [ecx + 
254] + +// CHECK: vcomish xmm6, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2f,0x72,0x80] + vcomish xmm6, word ptr [edx - 256] + +// CHECK: vdivph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5e,0xf4] + vdivph zmm6, zmm5, zmm4 + +// CHECK: vdivph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5e,0xf4] + vdivph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vdivph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x4f,0x5e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vdivph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vdivph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0x58,0x5e,0x31] + vdivph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vdivph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5e,0x71,0x7f] + vdivph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vdivph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0xdf,0x5e,0x72,0x80] + vdivph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vdivsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5e,0xf4] + vdivsh xmm6, xmm5, xmm4 + +// CHECK: vdivsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x5e,0xf4] + vdivsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vdivsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x5e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vdivsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vdivsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5e,0x31] + vdivsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vdivsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5e,0x71,0x7f] + vdivsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vdivsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x5e,0x72,0x80] + vdivsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vmaxph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5f,0xf4] + vmaxph zmm6, zmm5, zmm4 + +// CHECK: vmaxph zmm6, zmm5, zmm4, {sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5f,0xf4] + vmaxph zmm6, zmm5, zmm4, {sae} + +// CHECK: vmaxph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x4f,0x5f,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmaxph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmaxph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0x58,0x5f,0x31] + vmaxph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vmaxph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5f,0x71,0x7f] + vmaxph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vmaxph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0xdf,0x5f,0x72,0x80] + vmaxph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vmaxsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5f,0xf4] + vmaxsh xmm6, xmm5, xmm4 + +// CHECK: vmaxsh xmm6, xmm5, xmm4, {sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x5f,0xf4] + vmaxsh xmm6, xmm5, xmm4, {sae} + +// CHECK: vmaxsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x5f,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmaxsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vmaxsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5f,0x31] + 
vmaxsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vmaxsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5f,0x71,0x7f] + vmaxsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vmaxsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x5f,0x72,0x80] + vmaxsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vminph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5d,0xf4] + vminph zmm6, zmm5, zmm4 + +// CHECK: vminph zmm6, zmm5, zmm4, {sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5d,0xf4] + vminph zmm6, zmm5, zmm4, {sae} + +// CHECK: vminph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x4f,0x5d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vminph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vminph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0x58,0x5d,0x31] + vminph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vminph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5d,0x71,0x7f] + vminph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vminph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0xdf,0x5d,0x72,0x80] + vminph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vminsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5d,0xf4] + vminsh xmm6, xmm5, xmm4 + +// CHECK: vminsh xmm6, xmm5, xmm4, {sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x5d,0xf4] + vminsh xmm6, xmm5, xmm4, {sae} + +// CHECK: vminsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x5d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vminsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vminsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5d,0x31] + vminsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vminsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5d,0x71,0x7f] + vminsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vminsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x5d,0x72,0x80] + vminsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vmulph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x59,0xf4] + vmulph zmm6, zmm5, zmm4 + +// CHECK: vmulph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x59,0xf4] + vmulph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vmulph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x4f,0x59,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmulph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmulph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0x58,0x59,0x31] + vmulph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vmulph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x59,0x71,0x7f] + vmulph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vmulph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0xdf,0x59,0x72,0x80] + vmulph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vmulsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x59,0xf4] + vmulsh xmm6, xmm5, xmm4 + +// CHECK: vmulsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x59,0xf4] + vmulsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vmulsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: 
[0x62,0xf5,0x56,0x0f,0x59,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmulsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vmulsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x59,0x31] + vmulsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vmulsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x59,0x71,0x7f] + vmulsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vmulsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x59,0x72,0x80] + vmulsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vsubph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5c,0xf4] + vsubph zmm6, zmm5, zmm4 + +// CHECK: vsubph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5c,0xf4] + vsubph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vsubph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x4f,0x5c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vsubph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsubph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0x58,0x5c,0x31] + vsubph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vsubph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5c,0x71,0x7f] + vsubph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vsubph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0xdf,0x5c,0x72,0x80] + vsubph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vsubsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5c,0xf4] + vsubsh xmm6, xmm5, xmm4 + +// CHECK: vsubsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x5c,0xf4] + vsubsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vsubsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x5c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vsubsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vsubsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5c,0x31] + vsubsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vsubsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5c,0x71,0x7f] + vsubsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vsubsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x5c,0x72,0x80] + vsubsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vucomish xmm6, xmm5 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2e,0xf5] + vucomish xmm6, xmm5 + +// CHECK: vucomish xmm6, xmm5, {sae} +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x2e,0xf5] + vucomish xmm6, xmm5, {sae} + +// CHECK: vucomish xmm6, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vucomish xmm6, word ptr [esp + 8*esi + 268435456] + +// CHECK: vucomish xmm6, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2e,0x31] + vucomish xmm6, word ptr [ecx] + +// CHECK: vucomish xmm6, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2e,0x71,0x7f] + vucomish xmm6, word ptr [ecx + 254] + +// CHECK: vucomish xmm6, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2e,0x72,0x80] + vucomish xmm6, word ptr [edx - 256] diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s @@ -0,0 +1,281 @@ +// RUN: llvm-mc -triple 
x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vaddph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x05,0x14,0x20,0x58,0xf4] + vaddph ymm30, ymm29, ymm28 + +// CHECK: vaddph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x58,0xf4] + vaddph xmm30, xmm29, xmm28 + +// CHECK: vaddph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x27,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vaddph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x14,0x30,0x58,0x31] + vaddph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vaddph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x14,0x20,0x58,0x71,0x7f] + vaddph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vaddph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x65,0x14,0xb7,0x58,0x72,0x80] + vaddph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vaddph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vaddph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x14,0x10,0x58,0x31] + vaddph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vaddph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x58,0x71,0x7f] + vaddph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vaddph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x14,0x97,0x58,0x72,0x80] + vaddph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vcmpph k5, ymm29, ymm28, 123 +// CHECK: encoding: [0x62,0x93,0x14,0x20,0xc2,0xec,0x7b] + vcmpph k5, ymm29, ymm28, 123 + +// CHECK: vcmpph k5, xmm29, xmm28, 123 +// CHECK: encoding: [0x62,0x93,0x14,0x00,0xc2,0xec,0x7b] + vcmpph k5, xmm29, xmm28, 123 + +// CHECK: vcmpph k5 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xb3,0x14,0x07,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmpph k5 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vcmpph k5, xmm29, word ptr [r9]{1to8}, 123 +// CHECK: encoding: [0x62,0xd3,0x14,0x10,0xc2,0x29,0x7b] + vcmpph k5, xmm29, word ptr [r9]{1to8}, 123 + +// CHECK: vcmpph k5, xmm29, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x14,0x00,0xc2,0x69,0x7f,0x7b] + vcmpph k5, xmm29, xmmword ptr [rcx + 2032], 123 + +// CHECK: vcmpph k5 {k7}, xmm29, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x14,0x17,0xc2,0x6a,0x80,0x7b] + vcmpph k5 {k7}, xmm29, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vcmpph k5 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xb3,0x14,0x27,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmpph k5 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vcmpph k5, ymm29, word ptr [r9]{1to16}, 123 +// CHECK: encoding: [0x62,0xd3,0x14,0x30,0xc2,0x29,0x7b] + vcmpph k5, ymm29, word ptr [r9]{1to16}, 123 + +// CHECK: vcmpph k5, ymm29, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x14,0x20,0xc2,0x69,0x7f,0x7b] + vcmpph k5, ymm29, ymmword ptr [rcx + 4064], 123 + +// CHECK: vcmpph k5 {k7}, ymm29, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x14,0x37,0xc2,0x6a,0x80,0x7b] + vcmpph k5 {k7}, ymm29, word ptr [rdx - 
256]{1to16}, 123 + +// CHECK: vdivph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x05,0x14,0x20,0x5e,0xf4] + vdivph ymm30, ymm29, ymm28 + +// CHECK: vdivph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x5e,0xf4] + vdivph xmm30, xmm29, xmm28 + +// CHECK: vdivph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x27,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vdivph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x14,0x30,0x5e,0x31] + vdivph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vdivph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x14,0x20,0x5e,0x71,0x7f] + vdivph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vdivph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x65,0x14,0xb7,0x5e,0x72,0x80] + vdivph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vdivph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vdivph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x14,0x10,0x5e,0x31] + vdivph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vdivph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x5e,0x71,0x7f] + vdivph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vdivph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x14,0x97,0x5e,0x72,0x80] + vdivph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vmaxph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x05,0x14,0x20,0x5f,0xf4] + vmaxph ymm30, ymm29, ymm28 + +// CHECK: vmaxph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x5f,0xf4] + vmaxph xmm30, xmm29, xmm28 + +// CHECK: vmaxph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x27,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmaxph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x14,0x30,0x5f,0x31] + vmaxph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vmaxph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x14,0x20,0x5f,0x71,0x7f] + vmaxph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vmaxph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x65,0x14,0xb7,0x5f,0x72,0x80] + vmaxph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vmaxph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmaxph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x14,0x10,0x5f,0x31] + vmaxph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vmaxph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x5f,0x71,0x7f] + vmaxph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vmaxph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x14,0x97,0x5f,0x72,0x80] + vmaxph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vminph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x05,0x14,0x20,0x5d,0xf4] + vminph ymm30, ymm29, ymm28 + +// CHECK: vminph xmm30, 
xmm29, xmm28 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x5d,0xf4] + vminph xmm30, xmm29, xmm28 + +// CHECK: vminph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x27,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vminph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x14,0x30,0x5d,0x31] + vminph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vminph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x14,0x20,0x5d,0x71,0x7f] + vminph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vminph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x65,0x14,0xb7,0x5d,0x72,0x80] + vminph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vminph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vminph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x14,0x10,0x5d,0x31] + vminph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vminph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x5d,0x71,0x7f] + vminph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vminph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x14,0x97,0x5d,0x72,0x80] + vminph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vmulph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x05,0x14,0x20,0x59,0xf4] + vmulph ymm30, ymm29, ymm28 + +// CHECK: vmulph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x59,0xf4] + vmulph xmm30, xmm29, xmm28 + +// CHECK: vmulph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x27,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmulph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x14,0x30,0x59,0x31] + vmulph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vmulph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x14,0x20,0x59,0x71,0x7f] + vmulph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vmulph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x65,0x14,0xb7,0x59,0x72,0x80] + vmulph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vmulph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmulph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x14,0x10,0x59,0x31] + vmulph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vmulph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x59,0x71,0x7f] + vmulph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vmulph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x14,0x97,0x59,0x72,0x80] + vmulph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vsubph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x05,0x14,0x20,0x5c,0xf4] + vsubph ymm30, ymm29, ymm28 + +// CHECK: vsubph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x5c,0xf4] + vsubph xmm30, xmm29, xmm28 + +// CHECK: vsubph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 
268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x27,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsubph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x14,0x30,0x5c,0x31] + vsubph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vsubph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x14,0x20,0x5c,0x71,0x7f] + vsubph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vsubph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x65,0x14,0xb7,0x5c,0x72,0x80] + vsubph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vsubph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsubph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x14,0x10,0x5c,0x31] + vsubph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vsubph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x5c,0x71,0x7f] + vsubph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vsubph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x14,0x97,0x5c,0x72,0x80] + vsubph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}