diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1995,6 +1995,25 @@
 TARGET_BUILTIN(__builtin_ia32_vcvtps2phx256_mask, "V8xV8fV8xUc", "ncV:256:", "avx512fp16,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_vcvtps2phx512_mask, "V16xV16fV16xUsIi", "ncV:512:", "avx512fp16")
 
+TARGET_BUILTIN(__builtin_ia32_vfmaddph, "V8xV8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmaddph256, "V16xV16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmaddph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddph512_maskz, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph, "V8xV8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256, "V16xV16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_maskz, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+
+TARGET_BUILTIN(__builtin_ia32_vfmsubaddph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmsubph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16")
+
+TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_maskz, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmsubsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
+
 // generic select intrinsics
 TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "ncV:128:", "avx512bw,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "ncV:256:", "avx512bw,avx512vl")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -12050,6 +12050,22 @@
   Intrinsic::ID IID = Intrinsic::not_intrinsic;
   switch (BuiltinID) {
   default: break;
+  case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
+    Subtract = true;
+    LLVM_FALLTHROUGH;
+  case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
+    IID = llvm::Intrinsic::x86_avx512fp16_vfmadd_ph_512;
+    break;
+  case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
+    Subtract = true;
+    LLVM_FALLTHROUGH;
+  case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
+    IID = llvm::Intrinsic::x86_avx512fp16_vfmaddsub_ph_512;
+    break;
   case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
     Subtract = true;
     LLVM_FALLTHROUGH;
@@ -12113,22 +12129,30 @@
   // Handle any required masking.
   Value *MaskFalseVal = nullptr;
   switch (BuiltinID) {
+  case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
   case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
    MaskFalseVal = Ops[0];
    break;
+  case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
   case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
   case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
   case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
    MaskFalseVal = Constant::getNullValue(Ops[0]->getType());
    break;
+  case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
+  case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
   case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
   case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
   case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
+  case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
+  case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
   case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
   case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
@@ -12159,9 +12183,21 @@
   Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0);
   Value *Res;
   if (Rnd != 4) {
-    Intrinsic::ID IID = Ops[0]->getType()->getPrimitiveSizeInBits() == 32 ?
-                        Intrinsic::x86_avx512_vfmadd_f32 :
-                        Intrinsic::x86_avx512_vfmadd_f64;
+    Intrinsic::ID IID;
+
+    switch (Ops[0]->getType()->getPrimitiveSizeInBits()) {
+    case 16:
+      IID = Intrinsic::x86_avx512fp16_vfmadd_f16;
+      break;
+    case 32:
+      IID = Intrinsic::x86_avx512_vfmadd_f32;
+      break;
+    case 64:
+      IID = Intrinsic::x86_avx512_vfmadd_f64;
+      break;
+    default:
+      llvm_unreachable("Unexpected size");
+    }
     Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
                                  {Ops[0], Ops[1], Ops[2], Ops[4]});
   } else if (CGF.Builder.getIsFPConstrained()) {
@@ -12764,6 +12800,7 @@
   case X86::BI__builtin_ia32_vfmaddss3:
   case X86::BI__builtin_ia32_vfmaddsd3:
+  case X86::BI__builtin_ia32_vfmaddsh3_mask:
   case X86::BI__builtin_ia32_vfmaddss3_mask:
   case X86::BI__builtin_ia32_vfmaddsd3_mask:
    return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
@@ -12771,20 +12808,28 @@
   case X86::BI__builtin_ia32_vfmaddsd:
    return EmitScalarFMAExpr(*this, E, Ops,
                             Constant::getNullValue(Ops[0]->getType()));
+  case X86::BI__builtin_ia32_vfmaddsh3_maskz:
   case X86::BI__builtin_ia32_vfmaddss3_maskz:
   case X86::BI__builtin_ia32_vfmaddsd3_maskz:
    return EmitScalarFMAExpr(*this, E, Ops, Ops[0], /*ZeroMask*/ true);
+  case X86::BI__builtin_ia32_vfmaddsh3_mask3:
   case X86::BI__builtin_ia32_vfmaddss3_mask3:
   case X86::BI__builtin_ia32_vfmaddsd3_mask3:
    return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2);
+  case X86::BI__builtin_ia32_vfmsubsh3_mask3:
   case X86::BI__builtin_ia32_vfmsubss3_mask3:
   case X86::BI__builtin_ia32_vfmsubsd3_mask3:
    return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2,
                             /*NegAcc*/ true);
+  case X86::BI__builtin_ia32_vfmaddph:
   case X86::BI__builtin_ia32_vfmaddps:
   case X86::BI__builtin_ia32_vfmaddpd:
+  case X86::BI__builtin_ia32_vfmaddph256:
   case X86::BI__builtin_ia32_vfmaddps256:
   case X86::BI__builtin_ia32_vfmaddpd256:
+  case X86::BI__builtin_ia32_vfmaddph512_mask:
+  case X86::BI__builtin_ia32_vfmaddph512_maskz:
+  case 
X86::BI__builtin_ia32_vfmaddph512_mask3: case X86::BI__builtin_ia32_vfmaddps512_mask: case X86::BI__builtin_ia32_vfmaddps512_maskz: case X86::BI__builtin_ia32_vfmaddps512_mask3: @@ -12793,7 +12838,12 @@ case X86::BI__builtin_ia32_vfmaddpd512_maskz: case X86::BI__builtin_ia32_vfmaddpd512_mask3: case X86::BI__builtin_ia32_vfmsubpd512_mask3: + case X86::BI__builtin_ia32_vfmsubph512_mask3: return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ false); + case X86::BI__builtin_ia32_vfmaddsubph512_mask: + case X86::BI__builtin_ia32_vfmaddsubph512_maskz: + case X86::BI__builtin_ia32_vfmaddsubph512_mask3: + case X86::BI__builtin_ia32_vfmsubaddph512_mask3: case X86::BI__builtin_ia32_vfmaddsubps512_mask: case X86::BI__builtin_ia32_vfmaddsubps512_maskz: case X86::BI__builtin_ia32_vfmaddsubps512_mask3: diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -2423,6 +2423,492 @@ _MM_FROUND_CUR_DIRECTION); } +#define _mm512_fmadd_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask3( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_fmsub_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_fnmadd_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask3( \ + -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ + -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_fnmsub_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ + -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A, + 
__m512h __B, + __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A, + __m512h __B, + __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, + -(__v32hf)__C, (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, + -(__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_maskz( + (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A, + __m512h __B, + __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, + (__v32hf)__C, (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A, + __m512h __B, + __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, + -(__v32hf)__C, (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_maskz( + -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fmaddsub_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define 
_mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_fmsubadd_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddsubph512_mask( + (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddsubph512_mask( + (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + return (__m512h)__builtin_ia32_vfmaddsubph512_mask3( + (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddsubph512_maskz( + (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddsubph512_mask( + (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddsubph512_mask( + (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddsubph512_maskz( + (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmsubph512_mask3( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \ + 
(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + return (__m512h)__builtin_ia32_vfmsubaddph512_mask3( + (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmsubph512_mask3( \ + -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, + -(__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W, + __m128h __A, + __m128h __B) { + return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B, + (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B, + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fmadd_round_sh(A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \ + (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), 
(__v8hf)(__m128h)(Y), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, + -(__v8hf)__B, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, + -(__v8hf)__B, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fmsub_round_sh(A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, + -(__v8hf)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ + (__mmask8)(U), (int)R)) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \ + ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \ + (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W, + __m128h __A, + __m128h __B) { + return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B, + (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B, + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fnmadd_round_sh(A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ + (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \ + 
(__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W, + __m128h __A, + __m128h __B) { + return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B, + (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B, + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fnmsub_round_sh(A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ + (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \ + ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \ + (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ + (__mmask8)(U), (int)(R))) + static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_reduce_add_ph(__m512h __W) { return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W); diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -1371,6 +1371,378 @@ (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); } +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, + (__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A, + __mmask8 __U, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, + -(__v8hf)__C); +} + +static __inline__ __m128h 
__DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A, + __mmask8 __U, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A, + __m256h __B, + __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, + (__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A, + __m256h __B, + __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, + -(__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)__C); +} + +static 
__inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, + (__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, + -(__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), + (__v8hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, + (__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)_mm256_setzero_ph()); +} + +static 
__inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, + -(__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), + (__v8hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), + (__v8hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, + (__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C), + (__v8hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A, + __m256h __B, + __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, + (__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C), + (__v16hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, + -(__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), + (__v8hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return 
(__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), + (__v8hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A, + __m256h __B, + __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, + -(__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), + (__v16hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), + (__v16hf)__C); +} + static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -4084,6 +4084,9 @@ case X86::BI__builtin_ia32_vfmaddss3_mask: case X86::BI__builtin_ia32_vfmaddss3_maskz: case X86::BI__builtin_ia32_vfmaddss3_mask3: + case X86::BI__builtin_ia32_vfmaddsh3_mask: + case X86::BI__builtin_ia32_vfmaddsh3_maskz: + case X86::BI__builtin_ia32_vfmaddsh3_mask3: case X86::BI__builtin_ia32_vfmaddpd512_mask: case X86::BI__builtin_ia32_vfmaddpd512_maskz: case X86::BI__builtin_ia32_vfmaddpd512_mask3: @@ -4092,6 +4095,10 @@ case X86::BI__builtin_ia32_vfmaddps512_maskz: case X86::BI__builtin_ia32_vfmaddps512_mask3: case X86::BI__builtin_ia32_vfmsubps512_mask3: + case X86::BI__builtin_ia32_vfmaddph512_mask: + case X86::BI__builtin_ia32_vfmaddph512_maskz: + case X86::BI__builtin_ia32_vfmaddph512_mask3: + case X86::BI__builtin_ia32_vfmsubph512_mask3: case X86::BI__builtin_ia32_vfmaddsubpd512_mask: case X86::BI__builtin_ia32_vfmaddsubpd512_maskz: case X86::BI__builtin_ia32_vfmaddsubpd512_mask3: @@ -4100,6 +4107,10 @@ case X86::BI__builtin_ia32_vfmaddsubps512_maskz: case X86::BI__builtin_ia32_vfmaddsubps512_mask3: case X86::BI__builtin_ia32_vfmsubaddps512_mask3: + case X86::BI__builtin_ia32_vfmaddsubph512_mask: + case X86::BI__builtin_ia32_vfmaddsubph512_maskz: + case X86::BI__builtin_ia32_vfmaddsubph512_mask3: + case X86::BI__builtin_ia32_vfmsubaddph512_mask3: ArgNum = 4; HasRC = true; break; diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c --- a/clang/test/CodeGen/X86/avx512fp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c @@ -3163,6 +3163,839 @@ return _mm512_maskz_cvtxps_ph(A, B); } +__m512h test_mm512_fmadd_round_ph(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fmadd_round_ph + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + return _mm512_fmadd_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_fmadd_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fmadd_round_ph + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_fmadd_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask3_fmadd_round_ph(__m512h __A, __m512h __B, __m512h __C, 
__mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmadd_round_ph + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask3_fmadd_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_fmadd_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fmadd_round_ph + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer + return _mm512_maskz_fmadd_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_fmsub_round_ph(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fmsub_round_ph + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + return _mm512_fmsub_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_fmsub_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fmsub_round_ph + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_fmsub_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_fmsub_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fmsub_round_ph + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer + return _mm512_maskz_fmsub_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_fnmadd_round_ph(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fnmadd_round_ph + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + return _mm512_fnmadd_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask3_fnmadd_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fnmadd_round_ph + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask3_fnmadd_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_fnmadd_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fnmadd_round_ph + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer + return _mm512_maskz_fnmadd_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_fnmsub_round_ph(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fnmsub_round_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + return _mm512_fnmsub_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_fnmsub_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fnmsub_round_ph + 
// CHECK: fneg + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer + return _mm512_maskz_fnmsub_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_fmadd_ph(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fmadd_ph + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + return _mm512_fmadd_ph(__A, __B, __C); +} + +__m512h test_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fmadd_ph + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + return _mm512_mask_fmadd_ph(__A, __U, __B, __C); +} + +__m512h test_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmadd_ph + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask3_fmadd_ph(__A, __B, __C, __U); +} + +__m512h test_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fmadd_ph + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer + return _mm512_maskz_fmadd_ph(__U, __A, __B, __C); +} + +__m512h test_mm512_fmsub_ph(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fmsub_ph + // CHECK: fneg + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + return _mm512_fmsub_ph(__A, __B, __C); +} + +__m512h test_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fmsub_ph + // CHECK: fneg + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_fmsub_ph(__A, __U, __B, __C); +} + +__m512h test_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fmsub_ph + // CHECK: fneg + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer + return _mm512_maskz_fmsub_ph(__U, __A, __B, __C); +} + +__m512h test_mm512_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fnmadd_ph + // CHECK: fneg + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + return _mm512_fnmadd_ph(__A, __B, __C); +} + +__m512h test_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fnmadd_ph + // CHECK: fneg + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> 
%{{.*}}, <32 x half> %{{.*}} + return _mm512_mask3_fnmadd_ph(__A, __B, __C, __U); +} + +__m512h test_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fnmadd_ph + // CHECK: fneg + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer + return _mm512_maskz_fnmadd_ph(__U, __A, __B, __C); +} + +__m512h test_mm512_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fnmsub_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + return _mm512_fnmsub_ph(__A, __B, __C); +} + +__m512h test_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fnmsub_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer + return _mm512_maskz_fnmsub_ph(__U, __A, __B, __C); +} + +__m512h test_mm512_fmaddsub_round_ph(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fmaddsub_round_ph + // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512 + return _mm512_fmaddsub_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_fmaddsub_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fmaddsub_round_ph + // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_fmaddsub_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask3_fmaddsub_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmaddsub_round_ph + // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask3_fmaddsub_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_fmaddsub_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_ph + // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer + return _mm512_maskz_fmaddsub_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_fmsubadd_round_ph(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fmsubadd_round_ph + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512 + return _mm512_fmsubadd_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_fmsubadd_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fmsubadd_round_ph + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return 
_mm512_mask_fmsubadd_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_fmsubadd_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fmsubadd_round_ph + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer + return _mm512_maskz_fmsubadd_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fmaddsub_ph + // CHECK-NOT: fneg + // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}, i32 4) + return _mm512_fmaddsub_ph(__A, __B, __C); +} + +__m512h test_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fmaddsub_ph + // CHECK-NOT: fneg + // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}, i32 4) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_fmaddsub_ph(__A, __U, __B, __C); +} + +__m512h test_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmaddsub_ph + // CHECK-NOT: fneg + // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}, i32 4) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask3_fmaddsub_ph(__A, __B, __C, __U); +} + +__m512h test_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fmaddsub_ph + // CHECK-NOT: fneg + // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}, i32 4) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer + return _mm512_maskz_fmaddsub_ph(__U, __A, __B, __C); +} + +__m512h test_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_fmsubadd_ph + // CHECK: [[NEG:%.+]] = fneg + // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> [[NEG]], i32 4) + return _mm512_fmsubadd_ph(__A, __B, __C); +} + +__m512h test_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fmsubadd_ph + // CHECK: [[NEG:%.+]] = fneg + // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> [[NEG]], i32 4) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_fmsubadd_ph(__A, __U, __B, __C); +} + +__m512h test_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_maskz_fmsubadd_ph + // CHECK: [[NEG:%.+]] = fneg + // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> [[NEG]], i32 4) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 
x half> %{{.*}}, <32 x half> zeroinitializer + return _mm512_maskz_fmsubadd_ph(__U, __A, __B, __C); +} + +__m512h test_mm512_mask3_fmsub_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmsub_round_ph + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask3_fmsub_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmsub_ph + // CHECK: fneg + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask3_fmsub_ph(__A, __B, __C, __U); +} + +__m512h test_mm512_mask3_fmsubadd_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmsubadd_round_ph + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask3_fmsubadd_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fmsubadd_ph + // CHECK: [[NEG:%.+]] = fneg + // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> [[NEG]], i32 4) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask3_fmsubadd_ph(__A, __B, __C, __U); +} + +__m512h test_mm512_mask_fnmadd_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fnmadd_round_ph + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_fnmadd_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fnmadd_ph + // CHECK: fneg + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_fnmadd_ph(__A, __U, __B, __C); +} + +__m512h test_mm512_mask_fnmsub_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fnmsub_round_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_fnmsub_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask3_fnmsub_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fnmsub_round_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512 + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // 
CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask3_fnmsub_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_fnmsub_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_fnmsub_ph(__A, __U, __B, __C); +} + +__m512h test_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + // CHECK-LABEL: @test_mm512_mask3_fnmsub_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask3_fnmsub_ph(__A, __B, __C, __U); +} + +__m128h test_mm_fmadd_sh(__m128h __W, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_fmadd_sh + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + return _mm_fmadd_sh(__W, __A, __B); +} + +__m128h test_mm_mask_fmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fmadd_sh + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0 + return _mm_mask_fmadd_sh(__W, __U, __A, __B); +} + +__m128h test_mm_fmadd_round_sh(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fmadd_round_sh + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[FMA]], i64 0 + return _mm_fmadd_round_sh(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask_fmadd_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fmadd_round_sh + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]] + // CHECK-NEXT: 
insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0 + return _mm_mask_fmadd_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fmadd_sh + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000 + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0 + return _mm_maskz_fmadd_sh(__U, __A, __B, __C); +} + +__m128h test_mm_maskz_fmadd_round_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fmadd_round_sh + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000 + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0 + return _mm_maskz_fmadd_round_sh(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmadd_sh + // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0 + return _mm_mask3_fmadd_sh(__W, __X, __Y, __U); +} + +__m128h test_mm_mask3_fmadd_round_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmadd_round_sh + // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0 + return _mm_mask3_fmadd_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_fmsub_sh(__m128h __W, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_fmsub_sh + // CHECK: %{{.*}} = fneg <8 x half> %{{.*}} + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, 
i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = call half @llvm.fma.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}) + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + // CHECK-NEXT: ret <8 x half> %{{.*}} + return _mm_fmsub_sh(__W, __A, __B); +} + +__m128h test_mm_mask_fmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fmsub_sh + // CHECK: %{{.*}} = fneg <8 x half> %{{.*}} + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = call half @llvm.fma.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}) + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + // CHECK-NEXT: ret <8 x half> %{{.*}} + return _mm_mask_fmsub_sh(__W, __U, __A, __B); +} + +__m128h test_mm_fmsub_round_sh(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fmsub_round_sh + // CHECK: %{{.*}} = fneg <8 x half> %{{.*}} + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = call half @llvm.x86.avx512fp16.vfmadd.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}, i32 11) + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + // CHECK-NEXT: ret <8 x half> %{{.*}} + return _mm_fmsub_round_sh(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask_fmsub_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fmsub_round_sh + // CHECK: %{{.*}} = fneg <8 x half> %{{.*}} + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = call half @llvm.x86.avx512fp16.vfmadd.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}, i32 11) + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + // CHECK-NEXT: ret <8 x half> %{{.*}} + return _mm_mask_fmsub_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fmsub_sh + // CHECK: %{{.*}} = fneg <8 x half> %{{.*}} + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = call half @llvm.fma.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}) + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half 0xH0000 + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + // CHECK-NEXT: ret <8 x 
half> %{{.*}} + return _mm_maskz_fmsub_sh(__U, __A, __B, __C); +} + +__m128h test_mm_maskz_fmsub_round_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fmsub_round_sh + // CHECK: %{{.*}} = fneg <8 x half> %{{.*}} + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = call half @llvm.x86.avx512fp16.vfmadd.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}, i32 11) + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half 0xH0000 + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + // CHECK-NEXT: ret <8 x half> %{{.*}} + return _mm_maskz_fmsub_round_sh(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmsub_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + // CHECK-NEXT: [[C2:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0 + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C2]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0 + return _mm_mask3_fmsub_sh(__W, __X, __Y, __U); +} + +__m128h test_mm_mask3_fmsub_round_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmsub_round_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: [[C2:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0 + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C2]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0 + return _mm_mask3_fmsub_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_fnmadd_sh(__m128h __W, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_fnmadd_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + return _mm_fnmadd_sh(__W, __A, __B); +} + +__m128h test_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fnmadd_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: 
[[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0 + return _mm_mask_fnmadd_sh(__W, __U, __A, __B); +} + +__m128h test_mm_fnmadd_round_sh(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fnmadd_round_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[FMA]], i64 0 + return _mm_fnmadd_round_sh(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask_fnmadd_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fnmadd_round_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0 + return _mm_mask_fnmadd_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fnmadd_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000 + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0 + return _mm_maskz_fnmadd_sh(__U, __A, __B, __C); +} + +__m128h test_mm_maskz_fnmadd_round_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fnmadd_round_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000 + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0 + return _mm_maskz_fnmadd_round_sh(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | 
_MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fnmadd_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0 + return _mm_mask3_fnmadd_sh(__W, __X, __Y, __U); +} + +__m128h test_mm_mask3_fnmadd_round_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fnmadd_round_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0 + return _mm_mask3_fnmadd_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_fnmsub_sh(__m128h __W, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_fnmsub_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[NEG2:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + return _mm_fnmsub_sh(__W, __A, __B); +} + +__m128h test_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fnmsub_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[NEG2:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0 + return _mm_mask_fnmsub_sh(__W, __U, __A, __B); +} + +__m128h test_mm_fnmsub_round_sh(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fnmsub_round_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[NEG2:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[FMA]], i64 0 + return 
_mm_fnmsub_round_sh(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask_fnmsub_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_fnmsub_round_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[NEG2:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0 + return _mm_mask_fnmsub_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fnmsub_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[NEG2:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000 + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0 + return _mm_maskz_fnmsub_sh(__U, __A, __B, __C); +} + +__m128h test_mm_maskz_fnmsub_round_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fnmsub_round_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[NEG2:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000 + // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0 + return _mm_maskz_fnmsub_round_sh(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fnmsub_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[NEG2:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]]) + // CHECK-NEXT: [[C2:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0 + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C2]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0 + return _mm_mask3_fnmsub_sh(__W, __X, __Y, __U); +} + +__m128h 
test_mm_mask3_fnmsub_round_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fnmsub_round_sh + // CHECK: [[NEG:%.+]] = fneg + // CHECK: [[NEG2:%.+]] = fneg + // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11) + // CHECK-NEXT: [[C2:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0 + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C2]] + // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0 + return _mm_mask3_fnmsub_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + _Float16 test_mm512_reduce_add_ph(__m512h __W) { // CHECK-LABEL: @test_mm512_reduce_add_ph // CHECK: call reassoc half @llvm.vector.reduce.fadd.v32f16(half 0xH8000, <32 x half> %{{.*}}) diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c --- a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c @@ -2321,6 +2321,411 @@ return _mm256_maskz_cvtxps_ph(A, B); } +__m128h test_mm_fmadd_ph(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fmadd_ph + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + return _mm_fmadd_ph(__A, __B, __C); +} + +__m128h test_mm_mask_fmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_fmadd_ph + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask_fmadd_ph(__A, __U, __B, __C); +} + +__m128h test_mm_fmsub_ph(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fmsub_ph + // CHECK: fneg + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + return _mm_fmsub_ph(__A, __B, __C); +} + +__m128h test_mm_mask_fmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_fmsub_ph + // CHECK: fneg + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask_fmsub_ph(__A, __U, __B, __C); +} + +__m128h test_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmadd_ph + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask3_fmadd_ph(__A, __B, __C, __U); +} + +__m128h test_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fnmadd_ph + // CHECK: fneg + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return 
_mm_mask3_fnmadd_ph(__A, __B, __C, __U); +} + +__m128h test_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fmadd_ph + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_fmadd_ph(__U, __A, __B, __C); +} + +__m128h test_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fmsub_ph + // CHECK: fneg + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_fmsub_ph(__U, __A, __B, __C); +} + +__m128h test_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fnmadd_ph + // CHECK: fneg + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_fnmadd_ph(__U, __A, __B, __C); +} + +__m128h test_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fnmsub_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_fnmsub_ph(__U, __A, __B, __C); +} + +__m256h test_mm256_fmadd_ph(__m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_fmadd_ph + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + return _mm256_fmadd_ph(__A, __B, __C); +} + +__m256h test_mm256_mask_fmadd_ph(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_fmadd_ph + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask_fmadd_ph(__A, __U, __B, __C); +} + +__m256h test_mm256_fmsub_ph(__m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_fmsub_ph + // CHECK: fneg + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + return _mm256_fmsub_ph(__A, __B, __C); +} + +__m256h test_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_fmsub_ph + // CHECK: fneg + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask_fmsub_ph(__A, __U, __B, __C); +} + +__m256h test_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fmadd_ph + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return 
_mm256_mask3_fmadd_ph(__A, __B, __C, __U); +} + +__m256h test_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fnmadd_ph + // CHECK: fneg + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask3_fnmadd_ph(__A, __B, __C, __U); +} + +__m256h test_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_maskz_fmadd_ph + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_fmadd_ph(__U, __A, __B, __C); +} + +__m256h test_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_maskz_fmsub_ph + // CHECK: fneg + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_fmsub_ph(__U, __A, __B, __C); +} + +__m256h test_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_maskz_fnmadd_ph + // CHECK: fneg + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_fnmadd_ph(__U, __A, __B, __C); +} + +__m256h test_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_maskz_fnmsub_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_fnmsub_ph(__U, __A, __B, __C); +} + +__m128h test_mm_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fmaddsub_ph + // CHECK-NOT: fneg + // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + return _mm_fmaddsub_ph(__A, __B, __C); +} + +__m128h test_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_fmaddsub_ph + // CHECK-NOT: fneg + // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask_fmaddsub_ph(__A, __U, __B, __C); +} + +__m128h test_mm_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fmsubadd_ph + // CHECK: [[NEG:%.+]] = fneg + // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> [[NEG]]) + return _mm_fmsubadd_ph(__A, __B, __C); +} + +__m128h test_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_fmsubadd_ph + // CHECK: [[NEG:%.+]] = fneg + // CHECK: call <8 x half> 
@llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> [[NEG]]) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask_fmsubadd_ph(__A, __U, __B, __C); +} + +__m128h test_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmaddsub_ph + // CHECK-NOT: fneg + // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask3_fmaddsub_ph(__A, __B, __C, __U); +} + +__m128h test_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fmaddsub_ph + // CHECK-NOT: fneg + // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_fmaddsub_ph(__U, __A, __B, __C); +} + +__m128h test_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_maskz_fmsubadd_ph + // CHECK: [[NEG:%.+]] = fneg + // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> [[NEG]]) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_fmsubadd_ph(__U, __A, __B, __C); +} + +__m256h test_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_fmaddsub_ph + // CHECK-NOT: fneg + // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + return _mm256_fmaddsub_ph(__A, __B, __C); +} + +__m256h test_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_fmaddsub_ph + // CHECK-NOT: fneg + // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask_fmaddsub_ph(__A, __U, __B, __C); +} + +__m256h test_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_fmsubadd_ph + // CHECK: [[NEG:%.+]] = fneg + // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> [[NEG]]) + return _mm256_fmsubadd_ph(__A, __B, __C); +} + +__m256h test_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_fmsubadd_ph + // CHECK: [[NEG:%.+]] = fneg + // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> [[NEG]]) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask_fmsubadd_ph(__A, __U, __B, __C); +} + +__m256h test_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fmaddsub_ph + // CHECK-NOT: fneg + // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // 
CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask3_fmaddsub_ph(__A, __B, __C, __U); +} + +__m256h test_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_maskz_fmaddsub_ph + // CHECK-NOT: fneg + // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_fmaddsub_ph(__U, __A, __B, __C); +} + +__m256h test_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_maskz_fmsubadd_ph + // CHECK: [[NEG:%.+]] = fneg + // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> [[NEG]]) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_fmsubadd_ph(__U, __A, __B, __C); +} + +__m128h test_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmsub_ph + // CHECK: fneg + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask3_fmsub_ph(__A, __B, __C, __U); +} + +__m256h test_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fmsub_ph + // CHECK: fneg + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask3_fmsub_ph(__A, __B, __C, __U); +} + +__m128h test_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fmsubadd_ph + // CHECK: [[NEG:%.+]] = fneg + // CHECK: call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> [[NEG]]) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask3_fmsubadd_ph(__A, __B, __C, __U); +} + +__m256h test_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fmsubadd_ph + // CHECK: [[NEG:%.+]] = fneg + // CHECK: call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> [[NEG]]) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask3_fmsubadd_ph(__A, __B, __C, __U); +} + +__m128h test_mm_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fnmadd_ph + // CHECK: fneg + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + return _mm_fnmadd_ph(__A, __B, __C); +} + +__m128h test_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_fnmadd_ph + // CHECK: fneg + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> 
%{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask_fnmadd_ph(__A, __U, __B, __C); +} + +__m256h test_mm256_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_fnmadd_ph + // CHECK: fneg + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + return _mm256_fnmadd_ph(__A, __B, __C); +} + +__m256h test_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_fnmadd_ph + // CHECK: fneg + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask_fnmadd_ph(__A, __U, __B, __C); +} + +__m128h test_mm_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_fnmsub_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + return _mm_fnmsub_ph(__A, __B, __C); +} + +__m128h test_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_fnmsub_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask_fnmsub_ph(__A, __U, __B, __C); +} + +__m128h test_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + // CHECK-LABEL: @test_mm_mask3_fnmsub_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: call <8 x half> @llvm.fma.v8f16(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask3_fnmsub_ph(__A, __B, __C, __U); +} + +__m256h test_mm256_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_fnmsub_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + return _mm256_fnmsub_ph(__A, __B, __C); +} + +__m256h test_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_fnmsub_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask_fnmsub_ph(__A, __U, __B, __C); +} + +__m256h test_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + // CHECK-LABEL: @test_mm256_mask3_fnmsub_ph + // CHECK: fneg + // CHECK: fneg + // CHECK: call <16 x half> @llvm.fma.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}) + return _mm256_mask3_fnmsub_ph(__A, __B, __C, __U); +} __m128h test_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { // CHECK-LABEL: @test_mm_mask_blend_ph // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5709,4 +5709,27 @@ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], [ IntrNoMem, 
ImmArg<ArgIndex<4>> ]>;
+
+  def int_x86_avx512fp16_vfmadd_ph_512
+      : Intrinsic<[ llvm_v32f16_ty ],
+                  [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+  def int_x86_avx512fp16_vfmaddsub_ph_128
+      : GCCBuiltin<"__builtin_ia32_vfmaddsubph">,
+        Intrinsic<[ llvm_v8f16_ty ],
+                  [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty ],
+                  [ IntrNoMem ]>;
+  def int_x86_avx512fp16_vfmaddsub_ph_256
+      : GCCBuiltin<"__builtin_ia32_vfmaddsubph256">,
+        Intrinsic<[ llvm_v16f16_ty ],
+                  [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_v16f16_ty ],
+                  [ IntrNoMem ]>;
+  def int_x86_avx512fp16_vfmaddsub_ph_512
+      : Intrinsic<[ llvm_v32f16_ty ],
+                  [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+  def int_x86_avx512fp16_vfmadd_f16
+      : Intrinsic<[ llvm_half_ty ],
+                  [ llvm_half_ty, llvm_half_ty, llvm_half_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
 }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1934,6 +1934,8 @@
     setOperationAction(ISD::LOAD, VT, Legal);
     setOperationAction(ISD::STORE, VT, Legal);
+    setOperationAction(ISD::FMA, VT, Legal);
+    setOperationAction(ISD::STRICT_FMA, VT, Legal);
     setOperationAction(ISD::VSELECT, VT, Legal);
     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
     setOperationAction(ISD::SELECT, VT, Custom);
@@ -32720,6 +32722,8 @@
     return false;
   switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f16:
+    return Subtarget.hasFP16();
   case MVT::f32:
   case MVT::f64:
     return true;
@@ -49021,7 +49025,9 @@
   }
   EVT ScalarVT = VT.getScalarType();
-  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
+  if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+       !Subtarget.hasAnyFMA()) &&
+      !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
     return SDValue();
   auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -6760,14 +6760,14 @@
                   OpcodeStr, "$src3, $src2", "$src2, $src3",
                   (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
                   (_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
-                  AVX512FMA3Base, Sched<[sched]>;
+                  EVEX_4V, Sched<[sched]>;
  defm m: AVX512_maskable_fma,
-                  AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+                  EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
  defm mb: AVX512_maskable_fma,
-                  AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+                  EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
@@ -6791,21 +6791,22 @@
                   OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
                   (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))),
                   (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>,
-                  AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+                  EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}

multiclass avx512_fma3p_213_common opc, string OpcodeStr, SDPatternOperator OpNode,
                                   SDNode MaskOpNode, SDNode OpNodeRnd,
                                   X86SchedWriteWidths sched,
-                                  AVX512VLVectorVTInfo _, string Suff> {
-  let Predicates = [HasAVX512] in {
+                                  AVX512VLVectorVTInfo _, string Suff,
+                                  Predicate prd = HasAVX512> {
+  let Predicates = [prd] in {
    defm Z : avx512_fma3p_213_rm,
             avx512_fma3_213_round,
             EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
  }
-  let Predicates = [HasVLX, HasAVX512] in {
+  let Predicates = [HasVLX, prd] in {
    defm Z256 : avx512_fma3p_213_rm, EVEX_V256,
EVEX_CD8<_.info256.EltSize, CD8VF>; @@ -6817,12 +6818,15 @@ multiclass avx512_fma3p_213_f opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, SDNode OpNodeRnd> { + defm PH : avx512_fma3p_213_common, T_MAP6PD; defm PS : avx512_fma3p_213_common; + avx512vl_f32_info, "PS">, T8PD; defm PD : avx512_fma3p_213_common, VEX_W; + avx512vl_f64_info, "PD">, T8PD, VEX_W; } defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", any_fma, @@ -6849,14 +6853,14 @@ OpcodeStr, "$src3, $src2", "$src2, $src3", (null_frag), (_.VT (MaskOpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>, - AVX512FMA3Base, Sched<[sched]>; + EVEX_4V, Sched<[sched]>; defm m: AVX512_maskable_fma, - AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; defm mb: AVX512_maskable_fma, AVX512FMA3Base, EVEX_B, + _.RC:$src1)), 1, 0>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -6882,21 +6886,22 @@ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))), - 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; + 1, 1>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fma3p_231_common opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, SDNode OpNodeRnd, X86SchedWriteWidths sched, - AVX512VLVectorVTInfo _, string Suff> { - let Predicates = [HasAVX512] in { + AVX512VLVectorVTInfo _, string Suff, + Predicate prd = HasAVX512> { + let Predicates = [prd] in { defm Z : avx512_fma3p_231_rm, avx512_fma3_231_round, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; } - let Predicates = [HasVLX, HasAVX512] in { + let Predicates = [HasVLX, prd] in { defm Z256 : avx512_fma3p_231_rm, EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; @@ -6908,12 +6913,15 @@ multiclass avx512_fma3p_231_f opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, SDNode OpNodeRnd > { + defm PH : avx512_fma3p_231_common, T_MAP6PD; defm PS : avx512_fma3p_231_common; + avx512vl_f32_info, "PS">, T8PD; defm PD : avx512_fma3p_231_common, VEX_W; + avx512vl_f64_info, "PD">, T8PD, VEX_W; } defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", any_fma, @@ -6939,7 +6947,7 @@ OpcodeStr, "$src3, $src2", "$src2, $src3", (null_frag), (_.VT (MaskOpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>, - AVX512FMA3Base, Sched<[sched]>; + EVEX_4V, Sched<[sched]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. @@ -6948,7 +6956,7 @@ OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), (_.VT (MaskOpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>, - AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
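For orientation only (not part of the patch): the 213, 231 and 132 multiclass families extended above all encode the same multiply-add, differing only in which of the three sources feeds the multiply and which is accumulated. A minimal C sketch of the per-element math, with float standing in for the new half element type and the helper names invented for illustration:

/* Illustrative only: per-element math of the three FMA operand orders. */
static inline float fmadd132(float dst, float src2, float src3) {
  return dst * src3 + src2;  /* vfmadd132*: dst = dst * src3 + src2 */
}
static inline float fmadd213(float dst, float src2, float src3) {
  return src2 * dst + src3;  /* vfmadd213*: dst = src2 * dst + src3 */
}
static inline float fmadd231(float dst, float src2, float src3) {
  return src2 * src3 + dst;  /* vfmadd231*: dst = src2 * src3 + dst */
}

All three compute the same value; the form chosen at instruction selection mainly decides which source can be folded from memory, which is what the load-placement comments above are about.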
@@ -6960,7 +6968,7 @@ _.RC:$src1, _.RC:$src2)), (_.VT (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1, _.RC:$src2)), 1, 0>, - AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -6974,21 +6982,22 @@ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))), - 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; + 1, 1>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fma3p_132_common opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, SDNode OpNodeRnd, X86SchedWriteWidths sched, - AVX512VLVectorVTInfo _, string Suff> { - let Predicates = [HasAVX512] in { + AVX512VLVectorVTInfo _, string Suff, + Predicate prd = HasAVX512> { + let Predicates = [prd] in { defm Z : avx512_fma3p_132_rm, avx512_fma3_132_round, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; } - let Predicates = [HasVLX, HasAVX512] in { + let Predicates = [HasVLX, prd] in { defm Z256 : avx512_fma3p_132_rm, EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; @@ -7000,12 +7009,15 @@ multiclass avx512_fma3p_132_f opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, SDNode OpNodeRnd > { + defm PH : avx512_fma3p_132_common, T_MAP6PD; defm PS : avx512_fma3p_132_common; + avx512vl_f32_info, "PS">, T8PD; defm PD : avx512_fma3p_132_common, VEX_W; + avx512vl_f64_info, "PD">, T8PD, VEX_W; } defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", any_fma, @@ -7028,39 +7040,39 @@ defm r_Int: AVX512_maskable_3src_scalar, - AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC; + EVEX_4V, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC; let mayLoad = 1 in defm m_Int: AVX512_maskable_3src_scalar, - AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC; + EVEX_4V, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC; let Uses = [MXCSR] in defm rb_Int: AVX512_maskable_3src_scalar, - AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>; + EVEX_4V, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>; let isCodeGenOnly = 1, isCommutable = 1 in { - def r : AVX512FMA3S, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC; - def m : AVX512FMA3S, Sched<[SchedWriteFMA.Scl]>, EVEX_4V, SIMD_EXC; + def m : AVX512, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC; + [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, EVEX_4V, SIMD_EXC; let Uses = [MXCSR] in - def rb : AVX512FMA3S, EVEX_B, EVEX_RC, - Sched<[SchedWriteFMA.Scl]>; + Sched<[SchedWriteFMA.Scl]>, EVEX_4V; }// isCodeGenOnly = 1 }// Constraints = "$src1 = $dst" } @@ -7104,10 +7116,15 @@ let Predicates = [HasAVX512] in { defm NAME : avx512_fma3s_all, - EVEX_CD8<32, CD8VT1>, VEX_LIG; + EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD; defm NAME : avx512_fma3s_all, - EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; + EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W, T8PD; + } + let Predicates = [HasFP16] in { + defm NAME : avx512_fma3s_all, + EVEX_CD8<16, CD8VT1>, VEX_LIG, T_MAP6PD; } } @@ -7119,8 +7136,9 @@ multiclass avx512_scalar_fma_patterns { - let Predicates = [HasAVX512] in { + X86VectorVTInfo _, PatLeaf ZeroFP, + Predicate prd = HasAVX512> { + let Predicates = [prd] in { def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (Op _.FRC:$src2, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), @@ -7318,6 +7336,14 @@ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; } } +defm : avx512_scalar_fma_patterns; 
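Also for orientation (not part of the patch): the scalar pattern instantiations here back the _sh intrinsics exercised in the tests earlier. A rough C model of the element-0 behavior under merge masking, again with float standing in for _Float16 and all names invented for the sketch:

/* Element 0 only; lanes 1..7 are passed through from the first vector source. */
static inline float fmadd_s(float a, float b, float c)  { return  a * b + c; }
static inline float fmsub_s(float a, float b, float c)  { return  a * b - c; }
static inline float fnmadd_s(float a, float b, float c) { return -(a * b) + c; }
static inline float fnmsub_s(float a, float b, float c) { return -(a * b) - c; }

/* Merge masking keeps the pass-through lane when the mask bit is clear. */
static inline float mask_merge(unsigned char k, float fma, float passthru) {
  return (k & 1) ? fma : passthru;
}

Zero masking differs only in substituting 0.0 for the pass-through value, which is what the "half 0xH0000" selects in the maskz tests check for.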
+defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; defm : avx512_scalar_fma_patterns; @@ -7350,13 +7376,13 @@ (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>, - AVX512FMA3Base, Sched<[sched]>; + T8PD, EVEX_4V, Sched<[sched]>; defm m: AVX512_maskable_3src, - AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>; + T8PD, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; defm mb: AVX512_maskable_3src, - AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + T8PD, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } } // Constraints = "$src1 = $dst" @@ -12355,13 +12381,13 @@ (ins VTI.RC:$src2, VTI.RC:$src3), OpStr, "$src3, $src2", "$src2, $src3", (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>, - AVX512FMA3Base, Sched<[sched]>; + T8PD, EVEX_4V, Sched<[sched]>; defm m: AVX512_maskable_3src, - AVX512FMA3Base, + T8PD, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -12377,7 +12403,7 @@ "$src2, ${src3}"#VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, - AVX512FMA3Base, EVEX_B, + T8PD, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp --- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp +++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp @@ -28,35 +28,43 @@ FMA3GROUP(Name, Suf##k, Attrs | X86InstrFMA3Group::KMergeMasked) \ FMA3GROUP(Name, Suf##kz, Attrs | X86InstrFMA3Group::KZeroMasked) -#define FMA3GROUP_PACKED_WIDTHS(Name, Suf, Attrs) \ - FMA3GROUP(Name, Suf##Ym, Attrs) \ - FMA3GROUP(Name, Suf##Yr, Attrs) \ +#define FMA3GROUP_PACKED_WIDTHS_Z(Name, Suf, Attrs) \ FMA3GROUP_MASKED(Name, Suf##Z128m, Attrs) \ FMA3GROUP_MASKED(Name, Suf##Z128r, Attrs) \ FMA3GROUP_MASKED(Name, Suf##Z256m, Attrs) \ FMA3GROUP_MASKED(Name, Suf##Z256r, Attrs) \ FMA3GROUP_MASKED(Name, Suf##Zm, Attrs) \ FMA3GROUP_MASKED(Name, Suf##Zr, Attrs) \ + +#define FMA3GROUP_PACKED_WIDTHS_ALL(Name, Suf, Attrs) \ + FMA3GROUP(Name, Suf##Ym, Attrs) \ + FMA3GROUP(Name, Suf##Yr, Attrs) \ + FMA3GROUP_PACKED_WIDTHS_Z(Name, Suf, Attrs) \ FMA3GROUP(Name, Suf##m, Attrs) \ FMA3GROUP(Name, Suf##r, Attrs) #define FMA3GROUP_PACKED(Name, Attrs) \ - FMA3GROUP_PACKED_WIDTHS(Name, PD, Attrs) \ - FMA3GROUP_PACKED_WIDTHS(Name, PS, Attrs) + FMA3GROUP_PACKED_WIDTHS_ALL(Name, PD, Attrs) \ + FMA3GROUP_PACKED_WIDTHS_Z(Name, PH, Attrs) \ + FMA3GROUP_PACKED_WIDTHS_ALL(Name, PS, Attrs) -#define FMA3GROUP_SCALAR_WIDTHS(Name, Suf, Attrs) \ +#define FMA3GROUP_SCALAR_WIDTHS_Z(Name, Suf, Attrs) \ FMA3GROUP(Name, Suf##Zm, Attrs) \ FMA3GROUP_MASKED(Name, Suf##Zm_Int, Attrs | X86InstrFMA3Group::Intrinsic) \ FMA3GROUP(Name, Suf##Zr, Attrs) \ FMA3GROUP_MASKED(Name, Suf##Zr_Int, Attrs | X86InstrFMA3Group::Intrinsic) \ + +#define FMA3GROUP_SCALAR_WIDTHS_ALL(Name, Suf, Attrs) \ + FMA3GROUP_SCALAR_WIDTHS_Z(Name, Suf, Attrs) \ FMA3GROUP(Name, Suf##m, Attrs) \ FMA3GROUP(Name, Suf##m_Int, Attrs | X86InstrFMA3Group::Intrinsic) \ FMA3GROUP(Name, Suf##r, Attrs) \ FMA3GROUP(Name, Suf##r_Int, Attrs | X86InstrFMA3Group::Intrinsic) #define FMA3GROUP_SCALAR(Name, Attrs) \ - FMA3GROUP_SCALAR_WIDTHS(Name, SD, Attrs) \ - FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs) + FMA3GROUP_SCALAR_WIDTHS_ALL(Name, SD, Attrs) \ + FMA3GROUP_SCALAR_WIDTHS_Z(Name, SH, Attrs) \ + FMA3GROUP_SCALAR_WIDTHS_ALL(Name, SS, Attrs) #define FMA3GROUP_FULL(Name, Attrs) \ 
FMA3GROUP_PACKED(Name, Attrs) \ @@ -78,15 +86,19 @@ #define FMA3GROUP_PACKED_AVX512(Name, Suf, Attrs) \ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs) \ + FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PH, Suf, Attrs) \ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PS, Suf, Attrs) #define FMA3GROUP_PACKED_AVX512_ROUND(Name, Suf, Attrs) \ FMA3GROUP_MASKED(Name, PDZ##Suf, Attrs) \ + FMA3GROUP_MASKED(Name, PHZ##Suf, Attrs) \ FMA3GROUP_MASKED(Name, PSZ##Suf, Attrs) #define FMA3GROUP_SCALAR_AVX512_ROUND(Name, Suf, Attrs) \ FMA3GROUP(Name, SDZ##Suf, Attrs) \ FMA3GROUP_MASKED(Name, SDZ##Suf##_Int, Attrs) \ + FMA3GROUP(Name, SHZ##Suf, Attrs) \ + FMA3GROUP_MASKED(Name, SHZ##Suf##_Int, Attrs) \ FMA3GROUP(Name, SSZ##Suf, Attrs) \ FMA3GROUP_MASKED(Name, SSZ##Suf##_Int, Attrs) @@ -130,14 +142,16 @@ // FMA3 instructions have a well defined encoding pattern we can exploit. uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags); - bool IsFMA3 = ((TSFlags & X86II::EncodingMask) == X86II::VEX || - (TSFlags & X86II::EncodingMask) == X86II::EVEX) && - (TSFlags & X86II::OpMapMask) == X86II::T8 && - (TSFlags & X86II::OpPrefixMask) == X86II::PD && - ((BaseOpcode >= 0x96 && BaseOpcode <= 0x9F) || - (BaseOpcode >= 0xA6 && BaseOpcode <= 0xAF) || - (BaseOpcode >= 0xB6 && BaseOpcode <= 0xBF)); - if (!IsFMA3) + bool IsFMA3Opcode = ((BaseOpcode >= 0x96 && BaseOpcode <= 0x9F) || + (BaseOpcode >= 0xA6 && BaseOpcode <= 0xAF) || + (BaseOpcode >= 0xB6 && BaseOpcode <= 0xBF)); + bool IsFMA3Encoding = ((TSFlags & X86II::EncodingMask) == X86II::VEX && + (TSFlags & X86II::OpMapMask) == X86II::T8) || + ((TSFlags & X86II::EncodingMask) == X86II::EVEX && + ((TSFlags & X86II::OpMapMask) == X86II::T8 || + (TSFlags & X86II::OpMapMask) == X86II::T_MAP6)); + bool IsFMA3Prefix = (TSFlags & X86II::OpPrefixMask) == X86II::PD; + if (!IsFMA3Opcode || !IsFMA3Encoding || !IsFMA3Prefix) return nullptr; verifyTables(); diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -3288,6 +3288,9 @@ { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256m, 0 }, { X86::VFMADD132PDZr, X86::VFMADD132PDZm, 0 }, { X86::VFMADD132PDr, X86::VFMADD132PDm, 0 }, + { X86::VFMADD132PHZ128r, X86::VFMADD132PHZ128m, 0 }, + { X86::VFMADD132PHZ256r, X86::VFMADD132PHZ256m, 0 }, + { X86::VFMADD132PHZr, X86::VFMADD132PHZm, 0 }, { X86::VFMADD132PSYr, X86::VFMADD132PSYm, 0 }, { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128m, 0 }, { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256m, 0 }, @@ -3297,6 +3300,8 @@ { X86::VFMADD132SDZr_Int, X86::VFMADD132SDZm_Int, TB_NO_REVERSE }, { X86::VFMADD132SDr, X86::VFMADD132SDm, 0 }, { X86::VFMADD132SDr_Int, X86::VFMADD132SDm_Int, TB_NO_REVERSE }, + { X86::VFMADD132SHZr, X86::VFMADD132SHZm, 0 }, + { X86::VFMADD132SHZr_Int, X86::VFMADD132SHZm_Int, TB_NO_REVERSE }, { X86::VFMADD132SSZr, X86::VFMADD132SSZm, 0 }, { X86::VFMADD132SSZr_Int, X86::VFMADD132SSZm_Int, TB_NO_REVERSE }, { X86::VFMADD132SSr, X86::VFMADD132SSm, 0 }, @@ -3306,6 +3311,9 @@ { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256m, 0 }, { X86::VFMADD213PDZr, X86::VFMADD213PDZm, 0 }, { X86::VFMADD213PDr, X86::VFMADD213PDm, 0 }, + { X86::VFMADD213PHZ128r, X86::VFMADD213PHZ128m, 0 }, + { X86::VFMADD213PHZ256r, X86::VFMADD213PHZ256m, 0 }, + { X86::VFMADD213PHZr, X86::VFMADD213PHZm, 0 }, { X86::VFMADD213PSYr, X86::VFMADD213PSYm, 0 }, { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128m, 0 }, { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256m, 0 }, @@ -3315,6 +3323,8 @@ { 
X86::VFMADD213SDZr_Int, X86::VFMADD213SDZm_Int, TB_NO_REVERSE }, { X86::VFMADD213SDr, X86::VFMADD213SDm, 0 }, { X86::VFMADD213SDr_Int, X86::VFMADD213SDm_Int, TB_NO_REVERSE }, + { X86::VFMADD213SHZr, X86::VFMADD213SHZm, 0 }, + { X86::VFMADD213SHZr_Int, X86::VFMADD213SHZm_Int, TB_NO_REVERSE }, { X86::VFMADD213SSZr, X86::VFMADD213SSZm, 0 }, { X86::VFMADD213SSZr_Int, X86::VFMADD213SSZm_Int, TB_NO_REVERSE }, { X86::VFMADD213SSr, X86::VFMADD213SSm, 0 }, @@ -3324,6 +3334,9 @@ { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256m, 0 }, { X86::VFMADD231PDZr, X86::VFMADD231PDZm, 0 }, { X86::VFMADD231PDr, X86::VFMADD231PDm, 0 }, + { X86::VFMADD231PHZ128r, X86::VFMADD231PHZ128m, 0 }, + { X86::VFMADD231PHZ256r, X86::VFMADD231PHZ256m, 0 }, + { X86::VFMADD231PHZr, X86::VFMADD231PHZm, 0 }, { X86::VFMADD231PSYr, X86::VFMADD231PSYm, 0 }, { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128m, 0 }, { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256m, 0 }, @@ -3333,6 +3346,8 @@ { X86::VFMADD231SDZr_Int, X86::VFMADD231SDZm_Int, TB_NO_REVERSE }, { X86::VFMADD231SDr, X86::VFMADD231SDm, 0 }, { X86::VFMADD231SDr_Int, X86::VFMADD231SDm_Int, TB_NO_REVERSE }, + { X86::VFMADD231SHZr, X86::VFMADD231SHZm, 0 }, + { X86::VFMADD231SHZr_Int, X86::VFMADD231SHZm_Int, TB_NO_REVERSE }, { X86::VFMADD231SSZr, X86::VFMADD231SSZm, 0 }, { X86::VFMADD231SSZr_Int, X86::VFMADD231SSZm_Int, TB_NO_REVERSE }, { X86::VFMADD231SSr, X86::VFMADD231SSm, 0 }, @@ -3350,6 +3365,9 @@ { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256m, 0 }, { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZm, 0 }, { X86::VFMADDSUB132PDr, X86::VFMADDSUB132PDm, 0 }, + { X86::VFMADDSUB132PHZ128r, X86::VFMADDSUB132PHZ128m, 0 }, + { X86::VFMADDSUB132PHZ256r, X86::VFMADDSUB132PHZ256m, 0 }, + { X86::VFMADDSUB132PHZr, X86::VFMADDSUB132PHZm, 0 }, { X86::VFMADDSUB132PSYr, X86::VFMADDSUB132PSYm, 0 }, { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128m, 0 }, { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256m, 0 }, @@ -3360,6 +3378,9 @@ { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256m, 0 }, { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZm, 0 }, { X86::VFMADDSUB213PDr, X86::VFMADDSUB213PDm, 0 }, + { X86::VFMADDSUB213PHZ128r, X86::VFMADDSUB213PHZ128m, 0 }, + { X86::VFMADDSUB213PHZ256r, X86::VFMADDSUB213PHZ256m, 0 }, + { X86::VFMADDSUB213PHZr, X86::VFMADDSUB213PHZm, 0 }, { X86::VFMADDSUB213PSYr, X86::VFMADDSUB213PSYm, 0 }, { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128m, 0 }, { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256m, 0 }, @@ -3370,6 +3391,9 @@ { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256m, 0 }, { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZm, 0 }, { X86::VFMADDSUB231PDr, X86::VFMADDSUB231PDm, 0 }, + { X86::VFMADDSUB231PHZ128r, X86::VFMADDSUB231PHZ128m, 0 }, + { X86::VFMADDSUB231PHZ256r, X86::VFMADDSUB231PHZ256m, 0 }, + { X86::VFMADDSUB231PHZr, X86::VFMADDSUB231PHZm, 0 }, { X86::VFMADDSUB231PSYr, X86::VFMADDSUB231PSYm, 0 }, { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128m, 0 }, { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256m, 0 }, @@ -3384,6 +3408,9 @@ { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256m, 0 }, { X86::VFMSUB132PDZr, X86::VFMSUB132PDZm, 0 }, { X86::VFMSUB132PDr, X86::VFMSUB132PDm, 0 }, + { X86::VFMSUB132PHZ128r, X86::VFMSUB132PHZ128m, 0 }, + { X86::VFMSUB132PHZ256r, X86::VFMSUB132PHZ256m, 0 }, + { X86::VFMSUB132PHZr, X86::VFMSUB132PHZm, 0 }, { X86::VFMSUB132PSYr, X86::VFMSUB132PSYm, 0 }, { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128m, 0 }, { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256m, 0 }, @@ -3393,6 +3420,8 @@ { X86::VFMSUB132SDZr_Int, 
X86::VFMSUB132SDZm_Int, TB_NO_REVERSE }, { X86::VFMSUB132SDr, X86::VFMSUB132SDm, 0 }, { X86::VFMSUB132SDr_Int, X86::VFMSUB132SDm_Int, TB_NO_REVERSE }, + { X86::VFMSUB132SHZr, X86::VFMSUB132SHZm, 0 }, + { X86::VFMSUB132SHZr_Int, X86::VFMSUB132SHZm_Int, TB_NO_REVERSE }, { X86::VFMSUB132SSZr, X86::VFMSUB132SSZm, 0 }, { X86::VFMSUB132SSZr_Int, X86::VFMSUB132SSZm_Int, TB_NO_REVERSE }, { X86::VFMSUB132SSr, X86::VFMSUB132SSm, 0 }, @@ -3402,6 +3431,9 @@ { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256m, 0 }, { X86::VFMSUB213PDZr, X86::VFMSUB213PDZm, 0 }, { X86::VFMSUB213PDr, X86::VFMSUB213PDm, 0 }, + { X86::VFMSUB213PHZ128r, X86::VFMSUB213PHZ128m, 0 }, + { X86::VFMSUB213PHZ256r, X86::VFMSUB213PHZ256m, 0 }, + { X86::VFMSUB213PHZr, X86::VFMSUB213PHZm, 0 }, { X86::VFMSUB213PSYr, X86::VFMSUB213PSYm, 0 }, { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128m, 0 }, { X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256m, 0 }, @@ -3411,6 +3443,8 @@ { X86::VFMSUB213SDZr_Int, X86::VFMSUB213SDZm_Int, TB_NO_REVERSE }, { X86::VFMSUB213SDr, X86::VFMSUB213SDm, 0 }, { X86::VFMSUB213SDr_Int, X86::VFMSUB213SDm_Int, TB_NO_REVERSE }, + { X86::VFMSUB213SHZr, X86::VFMSUB213SHZm, 0 }, + { X86::VFMSUB213SHZr_Int, X86::VFMSUB213SHZm_Int, TB_NO_REVERSE }, { X86::VFMSUB213SSZr, X86::VFMSUB213SSZm, 0 }, { X86::VFMSUB213SSZr_Int, X86::VFMSUB213SSZm_Int, TB_NO_REVERSE }, { X86::VFMSUB213SSr, X86::VFMSUB213SSm, 0 }, @@ -3420,6 +3454,9 @@ { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256m, 0 }, { X86::VFMSUB231PDZr, X86::VFMSUB231PDZm, 0 }, { X86::VFMSUB231PDr, X86::VFMSUB231PDm, 0 }, + { X86::VFMSUB231PHZ128r, X86::VFMSUB231PHZ128m, 0 }, + { X86::VFMSUB231PHZ256r, X86::VFMSUB231PHZ256m, 0 }, + { X86::VFMSUB231PHZr, X86::VFMSUB231PHZm, 0 }, { X86::VFMSUB231PSYr, X86::VFMSUB231PSYm, 0 }, { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128m, 0 }, { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256m, 0 }, @@ -3429,6 +3466,8 @@ { X86::VFMSUB231SDZr_Int, X86::VFMSUB231SDZm_Int, TB_NO_REVERSE }, { X86::VFMSUB231SDr, X86::VFMSUB231SDm, 0 }, { X86::VFMSUB231SDr_Int, X86::VFMSUB231SDm_Int, TB_NO_REVERSE }, + { X86::VFMSUB231SHZr, X86::VFMSUB231SHZm, 0 }, + { X86::VFMSUB231SHZr_Int, X86::VFMSUB231SHZm_Int, TB_NO_REVERSE }, { X86::VFMSUB231SSZr, X86::VFMSUB231SSZm, 0 }, { X86::VFMSUB231SSZr_Int, X86::VFMSUB231SSZm_Int, TB_NO_REVERSE }, { X86::VFMSUB231SSr, X86::VFMSUB231SSm, 0 }, @@ -3438,6 +3477,9 @@ { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256m, 0 }, { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZm, 0 }, { X86::VFMSUBADD132PDr, X86::VFMSUBADD132PDm, 0 }, + { X86::VFMSUBADD132PHZ128r, X86::VFMSUBADD132PHZ128m, 0 }, + { X86::VFMSUBADD132PHZ256r, X86::VFMSUBADD132PHZ256m, 0 }, + { X86::VFMSUBADD132PHZr, X86::VFMSUBADD132PHZm, 0 }, { X86::VFMSUBADD132PSYr, X86::VFMSUBADD132PSYm, 0 }, { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128m, 0 }, { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256m, 0 }, @@ -3448,6 +3490,9 @@ { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256m, 0 }, { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZm, 0 }, { X86::VFMSUBADD213PDr, X86::VFMSUBADD213PDm, 0 }, + { X86::VFMSUBADD213PHZ128r, X86::VFMSUBADD213PHZ128m, 0 }, + { X86::VFMSUBADD213PHZ256r, X86::VFMSUBADD213PHZ256m, 0 }, + { X86::VFMSUBADD213PHZr, X86::VFMSUBADD213PHZm, 0 }, { X86::VFMSUBADD213PSYr, X86::VFMSUBADD213PSYm, 0 }, { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128m, 0 }, { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256m, 0 }, @@ -3458,6 +3503,9 @@ { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256m, 0 }, { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZm, 0 }, { 
X86::VFMSUBADD231PDr, X86::VFMSUBADD231PDm, 0 }, + { X86::VFMSUBADD231PHZ128r, X86::VFMSUBADD231PHZ128m, 0 }, + { X86::VFMSUBADD231PHZ256r, X86::VFMSUBADD231PHZ256m, 0 }, + { X86::VFMSUBADD231PHZr, X86::VFMSUBADD231PHZm, 0 }, { X86::VFMSUBADD231PSYr, X86::VFMSUBADD231PSYm, 0 }, { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128m, 0 }, { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256m, 0 }, @@ -3480,6 +3528,9 @@ { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256m, 0 }, { X86::VFNMADD132PDZr, X86::VFNMADD132PDZm, 0 }, { X86::VFNMADD132PDr, X86::VFNMADD132PDm, 0 }, + { X86::VFNMADD132PHZ128r, X86::VFNMADD132PHZ128m, 0 }, + { X86::VFNMADD132PHZ256r, X86::VFNMADD132PHZ256m, 0 }, + { X86::VFNMADD132PHZr, X86::VFNMADD132PHZm, 0 }, { X86::VFNMADD132PSYr, X86::VFNMADD132PSYm, 0 }, { X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128m, 0 }, { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256m, 0 }, @@ -3489,6 +3540,8 @@ { X86::VFNMADD132SDZr_Int, X86::VFNMADD132SDZm_Int, TB_NO_REVERSE }, { X86::VFNMADD132SDr, X86::VFNMADD132SDm, 0 }, { X86::VFNMADD132SDr_Int, X86::VFNMADD132SDm_Int, TB_NO_REVERSE }, + { X86::VFNMADD132SHZr, X86::VFNMADD132SHZm, 0 }, + { X86::VFNMADD132SHZr_Int, X86::VFNMADD132SHZm_Int, TB_NO_REVERSE }, { X86::VFNMADD132SSZr, X86::VFNMADD132SSZm, 0 }, { X86::VFNMADD132SSZr_Int, X86::VFNMADD132SSZm_Int, TB_NO_REVERSE }, { X86::VFNMADD132SSr, X86::VFNMADD132SSm, 0 }, @@ -3498,6 +3551,9 @@ { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256m, 0 }, { X86::VFNMADD213PDZr, X86::VFNMADD213PDZm, 0 }, { X86::VFNMADD213PDr, X86::VFNMADD213PDm, 0 }, + { X86::VFNMADD213PHZ128r, X86::VFNMADD213PHZ128m, 0 }, + { X86::VFNMADD213PHZ256r, X86::VFNMADD213PHZ256m, 0 }, + { X86::VFNMADD213PHZr, X86::VFNMADD213PHZm, 0 }, { X86::VFNMADD213PSYr, X86::VFNMADD213PSYm, 0 }, { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128m, 0 }, { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256m, 0 }, @@ -3507,6 +3563,8 @@ { X86::VFNMADD213SDZr_Int, X86::VFNMADD213SDZm_Int, TB_NO_REVERSE }, { X86::VFNMADD213SDr, X86::VFNMADD213SDm, 0 }, { X86::VFNMADD213SDr_Int, X86::VFNMADD213SDm_Int, TB_NO_REVERSE }, + { X86::VFNMADD213SHZr, X86::VFNMADD213SHZm, 0 }, + { X86::VFNMADD213SHZr_Int, X86::VFNMADD213SHZm_Int, TB_NO_REVERSE }, { X86::VFNMADD213SSZr, X86::VFNMADD213SSZm, 0 }, { X86::VFNMADD213SSZr_Int, X86::VFNMADD213SSZm_Int, TB_NO_REVERSE }, { X86::VFNMADD213SSr, X86::VFNMADD213SSm, 0 }, @@ -3516,6 +3574,9 @@ { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256m, 0 }, { X86::VFNMADD231PDZr, X86::VFNMADD231PDZm, 0 }, { X86::VFNMADD231PDr, X86::VFNMADD231PDm, 0 }, + { X86::VFNMADD231PHZ128r, X86::VFNMADD231PHZ128m, 0 }, + { X86::VFNMADD231PHZ256r, X86::VFNMADD231PHZ256m, 0 }, + { X86::VFNMADD231PHZr, X86::VFNMADD231PHZm, 0 }, { X86::VFNMADD231PSYr, X86::VFNMADD231PSYm, 0 }, { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128m, 0 }, { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256m, 0 }, @@ -3525,6 +3586,8 @@ { X86::VFNMADD231SDZr_Int, X86::VFNMADD231SDZm_Int, TB_NO_REVERSE }, { X86::VFNMADD231SDr, X86::VFNMADD231SDm, 0 }, { X86::VFNMADD231SDr_Int, X86::VFNMADD231SDm_Int, TB_NO_REVERSE }, + { X86::VFNMADD231SHZr, X86::VFNMADD231SHZm, 0 }, + { X86::VFNMADD231SHZr_Int, X86::VFNMADD231SHZm_Int, TB_NO_REVERSE }, { X86::VFNMADD231SSZr, X86::VFNMADD231SSZm, 0 }, { X86::VFNMADD231SSZr_Int, X86::VFNMADD231SSZm_Int, TB_NO_REVERSE }, { X86::VFNMADD231SSr, X86::VFNMADD231SSm, 0 }, @@ -3542,6 +3605,9 @@ { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256m, 0 }, { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZm, 0 }, { X86::VFNMSUB132PDr, X86::VFNMSUB132PDm, 0 }, + { 
X86::VFNMSUB132PHZ128r, X86::VFNMSUB132PHZ128m, 0 }, + { X86::VFNMSUB132PHZ256r, X86::VFNMSUB132PHZ256m, 0 }, + { X86::VFNMSUB132PHZr, X86::VFNMSUB132PHZm, 0 }, { X86::VFNMSUB132PSYr, X86::VFNMSUB132PSYm, 0 }, { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128m, 0 }, { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256m, 0 }, @@ -3551,6 +3617,8 @@ { X86::VFNMSUB132SDZr_Int, X86::VFNMSUB132SDZm_Int, TB_NO_REVERSE }, { X86::VFNMSUB132SDr, X86::VFNMSUB132SDm, 0 }, { X86::VFNMSUB132SDr_Int, X86::VFNMSUB132SDm_Int, TB_NO_REVERSE }, + { X86::VFNMSUB132SHZr, X86::VFNMSUB132SHZm, 0 }, + { X86::VFNMSUB132SHZr_Int, X86::VFNMSUB132SHZm_Int, TB_NO_REVERSE }, { X86::VFNMSUB132SSZr, X86::VFNMSUB132SSZm, 0 }, { X86::VFNMSUB132SSZr_Int, X86::VFNMSUB132SSZm_Int, TB_NO_REVERSE }, { X86::VFNMSUB132SSr, X86::VFNMSUB132SSm, 0 }, @@ -3560,6 +3628,9 @@ { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256m, 0 }, { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZm, 0 }, { X86::VFNMSUB213PDr, X86::VFNMSUB213PDm, 0 }, + { X86::VFNMSUB213PHZ128r, X86::VFNMSUB213PHZ128m, 0 }, + { X86::VFNMSUB213PHZ256r, X86::VFNMSUB213PHZ256m, 0 }, + { X86::VFNMSUB213PHZr, X86::VFNMSUB213PHZm, 0 }, { X86::VFNMSUB213PSYr, X86::VFNMSUB213PSYm, 0 }, { X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128m, 0 }, { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256m, 0 }, @@ -3569,6 +3640,8 @@ { X86::VFNMSUB213SDZr_Int, X86::VFNMSUB213SDZm_Int, TB_NO_REVERSE }, { X86::VFNMSUB213SDr, X86::VFNMSUB213SDm, 0 }, { X86::VFNMSUB213SDr_Int, X86::VFNMSUB213SDm_Int, TB_NO_REVERSE }, + { X86::VFNMSUB213SHZr, X86::VFNMSUB213SHZm, 0 }, + { X86::VFNMSUB213SHZr_Int, X86::VFNMSUB213SHZm_Int, TB_NO_REVERSE }, { X86::VFNMSUB213SSZr, X86::VFNMSUB213SSZm, 0 }, { X86::VFNMSUB213SSZr_Int, X86::VFNMSUB213SSZm_Int, TB_NO_REVERSE }, { X86::VFNMSUB213SSr, X86::VFNMSUB213SSm, 0 }, @@ -3578,6 +3651,9 @@ { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256m, 0 }, { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZm, 0 }, { X86::VFNMSUB231PDr, X86::VFNMSUB231PDm, 0 }, + { X86::VFNMSUB231PHZ128r, X86::VFNMSUB231PHZ128m, 0 }, + { X86::VFNMSUB231PHZ256r, X86::VFNMSUB231PHZ256m, 0 }, + { X86::VFNMSUB231PHZr, X86::VFNMSUB231PHZm, 0 }, { X86::VFNMSUB231PSYr, X86::VFNMSUB231PSYm, 0 }, { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128m, 0 }, { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256m, 0 }, @@ -3587,6 +3663,8 @@ { X86::VFNMSUB231SDZr_Int, X86::VFNMSUB231SDZm_Int, TB_NO_REVERSE }, { X86::VFNMSUB231SDr, X86::VFNMSUB231SDm, 0 }, { X86::VFNMSUB231SDr_Int, X86::VFNMSUB231SDm_Int, TB_NO_REVERSE }, + { X86::VFNMSUB231SHZr, X86::VFNMSUB231SHZm, 0 }, + { X86::VFNMSUB231SHZr_Int, X86::VFNMSUB231SHZm_Int, TB_NO_REVERSE }, { X86::VFNMSUB231SSZr, X86::VFNMSUB231SSZm, 0 }, { X86::VFNMSUB231SSZr_Int, X86::VFNMSUB231SSZm_Int, TB_NO_REVERSE }, { X86::VFNMSUB231SSr, X86::VFNMSUB231SSm, 0 }, @@ -4599,6 +4677,12 @@ { X86::VFMADD132PDZ256rkz, X86::VFMADD132PDZ256mkz, 0 }, { X86::VFMADD132PDZrk, X86::VFMADD132PDZmk, 0 }, { X86::VFMADD132PDZrkz, X86::VFMADD132PDZmkz, 0 }, + { X86::VFMADD132PHZ128rk, X86::VFMADD132PHZ128mk, 0 }, + { X86::VFMADD132PHZ128rkz, X86::VFMADD132PHZ128mkz, 0 }, + { X86::VFMADD132PHZ256rk, X86::VFMADD132PHZ256mk, 0 }, + { X86::VFMADD132PHZ256rkz, X86::VFMADD132PHZ256mkz, 0 }, + { X86::VFMADD132PHZrk, X86::VFMADD132PHZmk, 0 }, + { X86::VFMADD132PHZrkz, X86::VFMADD132PHZmkz, 0 }, { X86::VFMADD132PSZ128rk, X86::VFMADD132PSZ128mk, 0 }, { X86::VFMADD132PSZ128rkz, X86::VFMADD132PSZ128mkz, 0 }, { X86::VFMADD132PSZ256rk, X86::VFMADD132PSZ256mk, 0 }, @@ -4607,6 +4691,8 @@ { X86::VFMADD132PSZrkz, X86::VFMADD132PSZmkz, 0 }, { 
X86::VFMADD132SDZr_Intk, X86::VFMADD132SDZm_Intk, TB_NO_REVERSE }, { X86::VFMADD132SDZr_Intkz, X86::VFMADD132SDZm_Intkz, TB_NO_REVERSE }, + { X86::VFMADD132SHZr_Intk, X86::VFMADD132SHZm_Intk, TB_NO_REVERSE }, + { X86::VFMADD132SHZr_Intkz, X86::VFMADD132SHZm_Intkz, TB_NO_REVERSE }, { X86::VFMADD132SSZr_Intk, X86::VFMADD132SSZm_Intk, TB_NO_REVERSE }, { X86::VFMADD132SSZr_Intkz, X86::VFMADD132SSZm_Intkz, TB_NO_REVERSE }, { X86::VFMADD213PDZ128rk, X86::VFMADD213PDZ128mk, 0 }, @@ -4615,6 +4701,12 @@ { X86::VFMADD213PDZ256rkz, X86::VFMADD213PDZ256mkz, 0 }, { X86::VFMADD213PDZrk, X86::VFMADD213PDZmk, 0 }, { X86::VFMADD213PDZrkz, X86::VFMADD213PDZmkz, 0 }, + { X86::VFMADD213PHZ128rk, X86::VFMADD213PHZ128mk, 0 }, + { X86::VFMADD213PHZ128rkz, X86::VFMADD213PHZ128mkz, 0 }, + { X86::VFMADD213PHZ256rk, X86::VFMADD213PHZ256mk, 0 }, + { X86::VFMADD213PHZ256rkz, X86::VFMADD213PHZ256mkz, 0 }, + { X86::VFMADD213PHZrk, X86::VFMADD213PHZmk, 0 }, + { X86::VFMADD213PHZrkz, X86::VFMADD213PHZmkz, 0 }, { X86::VFMADD213PSZ128rk, X86::VFMADD213PSZ128mk, 0 }, { X86::VFMADD213PSZ128rkz, X86::VFMADD213PSZ128mkz, 0 }, { X86::VFMADD213PSZ256rk, X86::VFMADD213PSZ256mk, 0 }, @@ -4623,6 +4715,8 @@ { X86::VFMADD213PSZrkz, X86::VFMADD213PSZmkz, 0 }, { X86::VFMADD213SDZr_Intk, X86::VFMADD213SDZm_Intk, TB_NO_REVERSE }, { X86::VFMADD213SDZr_Intkz, X86::VFMADD213SDZm_Intkz, TB_NO_REVERSE }, + { X86::VFMADD213SHZr_Intk, X86::VFMADD213SHZm_Intk, TB_NO_REVERSE }, + { X86::VFMADD213SHZr_Intkz, X86::VFMADD213SHZm_Intkz, TB_NO_REVERSE }, { X86::VFMADD213SSZr_Intk, X86::VFMADD213SSZm_Intk, TB_NO_REVERSE }, { X86::VFMADD213SSZr_Intkz, X86::VFMADD213SSZm_Intkz, TB_NO_REVERSE }, { X86::VFMADD231PDZ128rk, X86::VFMADD231PDZ128mk, 0 }, @@ -4631,6 +4725,12 @@ { X86::VFMADD231PDZ256rkz, X86::VFMADD231PDZ256mkz, 0 }, { X86::VFMADD231PDZrk, X86::VFMADD231PDZmk, 0 }, { X86::VFMADD231PDZrkz, X86::VFMADD231PDZmkz, 0 }, + { X86::VFMADD231PHZ128rk, X86::VFMADD231PHZ128mk, 0 }, + { X86::VFMADD231PHZ128rkz, X86::VFMADD231PHZ128mkz, 0 }, + { X86::VFMADD231PHZ256rk, X86::VFMADD231PHZ256mk, 0 }, + { X86::VFMADD231PHZ256rkz, X86::VFMADD231PHZ256mkz, 0 }, + { X86::VFMADD231PHZrk, X86::VFMADD231PHZmk, 0 }, + { X86::VFMADD231PHZrkz, X86::VFMADD231PHZmkz, 0 }, { X86::VFMADD231PSZ128rk, X86::VFMADD231PSZ128mk, 0 }, { X86::VFMADD231PSZ128rkz, X86::VFMADD231PSZ128mkz, 0 }, { X86::VFMADD231PSZ256rk, X86::VFMADD231PSZ256mk, 0 }, @@ -4639,6 +4739,8 @@ { X86::VFMADD231PSZrkz, X86::VFMADD231PSZmkz, 0 }, { X86::VFMADD231SDZr_Intk, X86::VFMADD231SDZm_Intk, TB_NO_REVERSE }, { X86::VFMADD231SDZr_Intkz, X86::VFMADD231SDZm_Intkz, TB_NO_REVERSE }, + { X86::VFMADD231SHZr_Intk, X86::VFMADD231SHZm_Intk, TB_NO_REVERSE }, + { X86::VFMADD231SHZr_Intkz, X86::VFMADD231SHZm_Intkz, TB_NO_REVERSE }, { X86::VFMADD231SSZr_Intk, X86::VFMADD231SSZm_Intk, TB_NO_REVERSE }, { X86::VFMADD231SSZr_Intkz, X86::VFMADD231SSZm_Intkz, TB_NO_REVERSE }, { X86::VFMADDSUB132PDZ128rk, X86::VFMADDSUB132PDZ128mk, 0 }, @@ -4647,6 +4749,12 @@ { X86::VFMADDSUB132PDZ256rkz, X86::VFMADDSUB132PDZ256mkz, 0 }, { X86::VFMADDSUB132PDZrk, X86::VFMADDSUB132PDZmk, 0 }, { X86::VFMADDSUB132PDZrkz, X86::VFMADDSUB132PDZmkz, 0 }, + { X86::VFMADDSUB132PHZ128rk, X86::VFMADDSUB132PHZ128mk, 0 }, + { X86::VFMADDSUB132PHZ128rkz, X86::VFMADDSUB132PHZ128mkz, 0 }, + { X86::VFMADDSUB132PHZ256rk, X86::VFMADDSUB132PHZ256mk, 0 }, + { X86::VFMADDSUB132PHZ256rkz, X86::VFMADDSUB132PHZ256mkz, 0 }, + { X86::VFMADDSUB132PHZrk, X86::VFMADDSUB132PHZmk, 0 }, + { X86::VFMADDSUB132PHZrkz, X86::VFMADDSUB132PHZmkz, 0 }, { X86::VFMADDSUB132PSZ128rk, 
X86::VFMADDSUB132PSZ128mk, 0 }, { X86::VFMADDSUB132PSZ128rkz, X86::VFMADDSUB132PSZ128mkz, 0 }, { X86::VFMADDSUB132PSZ256rk, X86::VFMADDSUB132PSZ256mk, 0 }, @@ -4659,6 +4767,12 @@ { X86::VFMADDSUB213PDZ256rkz, X86::VFMADDSUB213PDZ256mkz, 0 }, { X86::VFMADDSUB213PDZrk, X86::VFMADDSUB213PDZmk, 0 }, { X86::VFMADDSUB213PDZrkz, X86::VFMADDSUB213PDZmkz, 0 }, + { X86::VFMADDSUB213PHZ128rk, X86::VFMADDSUB213PHZ128mk, 0 }, + { X86::VFMADDSUB213PHZ128rkz, X86::VFMADDSUB213PHZ128mkz, 0 }, + { X86::VFMADDSUB213PHZ256rk, X86::VFMADDSUB213PHZ256mk, 0 }, + { X86::VFMADDSUB213PHZ256rkz, X86::VFMADDSUB213PHZ256mkz, 0 }, + { X86::VFMADDSUB213PHZrk, X86::VFMADDSUB213PHZmk, 0 }, + { X86::VFMADDSUB213PHZrkz, X86::VFMADDSUB213PHZmkz, 0 }, { X86::VFMADDSUB213PSZ128rk, X86::VFMADDSUB213PSZ128mk, 0 }, { X86::VFMADDSUB213PSZ128rkz, X86::VFMADDSUB213PSZ128mkz, 0 }, { X86::VFMADDSUB213PSZ256rk, X86::VFMADDSUB213PSZ256mk, 0 }, @@ -4671,6 +4785,12 @@ { X86::VFMADDSUB231PDZ256rkz, X86::VFMADDSUB231PDZ256mkz, 0 }, { X86::VFMADDSUB231PDZrk, X86::VFMADDSUB231PDZmk, 0 }, { X86::VFMADDSUB231PDZrkz, X86::VFMADDSUB231PDZmkz, 0 }, + { X86::VFMADDSUB231PHZ128rk, X86::VFMADDSUB231PHZ128mk, 0 }, + { X86::VFMADDSUB231PHZ128rkz, X86::VFMADDSUB231PHZ128mkz, 0 }, + { X86::VFMADDSUB231PHZ256rk, X86::VFMADDSUB231PHZ256mk, 0 }, + { X86::VFMADDSUB231PHZ256rkz, X86::VFMADDSUB231PHZ256mkz, 0 }, + { X86::VFMADDSUB231PHZrk, X86::VFMADDSUB231PHZmk, 0 }, + { X86::VFMADDSUB231PHZrkz, X86::VFMADDSUB231PHZmkz, 0 }, { X86::VFMADDSUB231PSZ128rk, X86::VFMADDSUB231PSZ128mk, 0 }, { X86::VFMADDSUB231PSZ128rkz, X86::VFMADDSUB231PSZ128mkz, 0 }, { X86::VFMADDSUB231PSZ256rk, X86::VFMADDSUB231PSZ256mk, 0 }, @@ -4683,6 +4803,12 @@ { X86::VFMSUB132PDZ256rkz, X86::VFMSUB132PDZ256mkz, 0 }, { X86::VFMSUB132PDZrk, X86::VFMSUB132PDZmk, 0 }, { X86::VFMSUB132PDZrkz, X86::VFMSUB132PDZmkz, 0 }, + { X86::VFMSUB132PHZ128rk, X86::VFMSUB132PHZ128mk, 0 }, + { X86::VFMSUB132PHZ128rkz, X86::VFMSUB132PHZ128mkz, 0 }, + { X86::VFMSUB132PHZ256rk, X86::VFMSUB132PHZ256mk, 0 }, + { X86::VFMSUB132PHZ256rkz, X86::VFMSUB132PHZ256mkz, 0 }, + { X86::VFMSUB132PHZrk, X86::VFMSUB132PHZmk, 0 }, + { X86::VFMSUB132PHZrkz, X86::VFMSUB132PHZmkz, 0 }, { X86::VFMSUB132PSZ128rk, X86::VFMSUB132PSZ128mk, 0 }, { X86::VFMSUB132PSZ128rkz, X86::VFMSUB132PSZ128mkz, 0 }, { X86::VFMSUB132PSZ256rk, X86::VFMSUB132PSZ256mk, 0 }, @@ -4691,6 +4817,8 @@ { X86::VFMSUB132PSZrkz, X86::VFMSUB132PSZmkz, 0 }, { X86::VFMSUB132SDZr_Intk, X86::VFMSUB132SDZm_Intk, TB_NO_REVERSE }, { X86::VFMSUB132SDZr_Intkz, X86::VFMSUB132SDZm_Intkz, TB_NO_REVERSE }, + { X86::VFMSUB132SHZr_Intk, X86::VFMSUB132SHZm_Intk, TB_NO_REVERSE }, + { X86::VFMSUB132SHZr_Intkz, X86::VFMSUB132SHZm_Intkz, TB_NO_REVERSE }, { X86::VFMSUB132SSZr_Intk, X86::VFMSUB132SSZm_Intk, TB_NO_REVERSE }, { X86::VFMSUB132SSZr_Intkz, X86::VFMSUB132SSZm_Intkz, TB_NO_REVERSE }, { X86::VFMSUB213PDZ128rk, X86::VFMSUB213PDZ128mk, 0 }, @@ -4699,6 +4827,12 @@ { X86::VFMSUB213PDZ256rkz, X86::VFMSUB213PDZ256mkz, 0 }, { X86::VFMSUB213PDZrk, X86::VFMSUB213PDZmk, 0 }, { X86::VFMSUB213PDZrkz, X86::VFMSUB213PDZmkz, 0 }, + { X86::VFMSUB213PHZ128rk, X86::VFMSUB213PHZ128mk, 0 }, + { X86::VFMSUB213PHZ128rkz, X86::VFMSUB213PHZ128mkz, 0 }, + { X86::VFMSUB213PHZ256rk, X86::VFMSUB213PHZ256mk, 0 }, + { X86::VFMSUB213PHZ256rkz, X86::VFMSUB213PHZ256mkz, 0 }, + { X86::VFMSUB213PHZrk, X86::VFMSUB213PHZmk, 0 }, + { X86::VFMSUB213PHZrkz, X86::VFMSUB213PHZmkz, 0 }, { X86::VFMSUB213PSZ128rk, X86::VFMSUB213PSZ128mk, 0 }, { X86::VFMSUB213PSZ128rkz, X86::VFMSUB213PSZ128mkz, 0 }, { X86::VFMSUB213PSZ256rk, 
X86::VFMSUB213PSZ256mk, 0 }, @@ -4707,6 +4841,8 @@ { X86::VFMSUB213PSZrkz, X86::VFMSUB213PSZmkz, 0 }, { X86::VFMSUB213SDZr_Intk, X86::VFMSUB213SDZm_Intk, TB_NO_REVERSE }, { X86::VFMSUB213SDZr_Intkz, X86::VFMSUB213SDZm_Intkz, TB_NO_REVERSE }, + { X86::VFMSUB213SHZr_Intk, X86::VFMSUB213SHZm_Intk, TB_NO_REVERSE }, + { X86::VFMSUB213SHZr_Intkz, X86::VFMSUB213SHZm_Intkz, TB_NO_REVERSE }, { X86::VFMSUB213SSZr_Intk, X86::VFMSUB213SSZm_Intk, TB_NO_REVERSE }, { X86::VFMSUB213SSZr_Intkz, X86::VFMSUB213SSZm_Intkz, TB_NO_REVERSE }, { X86::VFMSUB231PDZ128rk, X86::VFMSUB231PDZ128mk, 0 }, @@ -4715,6 +4851,12 @@ { X86::VFMSUB231PDZ256rkz, X86::VFMSUB231PDZ256mkz, 0 }, { X86::VFMSUB231PDZrk, X86::VFMSUB231PDZmk, 0 }, { X86::VFMSUB231PDZrkz, X86::VFMSUB231PDZmkz, 0 }, + { X86::VFMSUB231PHZ128rk, X86::VFMSUB231PHZ128mk, 0 }, + { X86::VFMSUB231PHZ128rkz, X86::VFMSUB231PHZ128mkz, 0 }, + { X86::VFMSUB231PHZ256rk, X86::VFMSUB231PHZ256mk, 0 }, + { X86::VFMSUB231PHZ256rkz, X86::VFMSUB231PHZ256mkz, 0 }, + { X86::VFMSUB231PHZrk, X86::VFMSUB231PHZmk, 0 }, + { X86::VFMSUB231PHZrkz, X86::VFMSUB231PHZmkz, 0 }, { X86::VFMSUB231PSZ128rk, X86::VFMSUB231PSZ128mk, 0 }, { X86::VFMSUB231PSZ128rkz, X86::VFMSUB231PSZ128mkz, 0 }, { X86::VFMSUB231PSZ256rk, X86::VFMSUB231PSZ256mk, 0 }, @@ -4723,6 +4865,8 @@ { X86::VFMSUB231PSZrkz, X86::VFMSUB231PSZmkz, 0 }, { X86::VFMSUB231SDZr_Intk, X86::VFMSUB231SDZm_Intk, TB_NO_REVERSE }, { X86::VFMSUB231SDZr_Intkz, X86::VFMSUB231SDZm_Intkz, TB_NO_REVERSE }, + { X86::VFMSUB231SHZr_Intk, X86::VFMSUB231SHZm_Intk, TB_NO_REVERSE }, + { X86::VFMSUB231SHZr_Intkz, X86::VFMSUB231SHZm_Intkz, TB_NO_REVERSE }, { X86::VFMSUB231SSZr_Intk, X86::VFMSUB231SSZm_Intk, TB_NO_REVERSE }, { X86::VFMSUB231SSZr_Intkz, X86::VFMSUB231SSZm_Intkz, TB_NO_REVERSE }, { X86::VFMSUBADD132PDZ128rk, X86::VFMSUBADD132PDZ128mk, 0 }, @@ -4731,6 +4875,12 @@ { X86::VFMSUBADD132PDZ256rkz, X86::VFMSUBADD132PDZ256mkz, 0 }, { X86::VFMSUBADD132PDZrk, X86::VFMSUBADD132PDZmk, 0 }, { X86::VFMSUBADD132PDZrkz, X86::VFMSUBADD132PDZmkz, 0 }, + { X86::VFMSUBADD132PHZ128rk, X86::VFMSUBADD132PHZ128mk, 0 }, + { X86::VFMSUBADD132PHZ128rkz, X86::VFMSUBADD132PHZ128mkz, 0 }, + { X86::VFMSUBADD132PHZ256rk, X86::VFMSUBADD132PHZ256mk, 0 }, + { X86::VFMSUBADD132PHZ256rkz, X86::VFMSUBADD132PHZ256mkz, 0 }, + { X86::VFMSUBADD132PHZrk, X86::VFMSUBADD132PHZmk, 0 }, + { X86::VFMSUBADD132PHZrkz, X86::VFMSUBADD132PHZmkz, 0 }, { X86::VFMSUBADD132PSZ128rk, X86::VFMSUBADD132PSZ128mk, 0 }, { X86::VFMSUBADD132PSZ128rkz, X86::VFMSUBADD132PSZ128mkz, 0 }, { X86::VFMSUBADD132PSZ256rk, X86::VFMSUBADD132PSZ256mk, 0 }, @@ -4743,6 +4893,12 @@ { X86::VFMSUBADD213PDZ256rkz, X86::VFMSUBADD213PDZ256mkz, 0 }, { X86::VFMSUBADD213PDZrk, X86::VFMSUBADD213PDZmk, 0 }, { X86::VFMSUBADD213PDZrkz, X86::VFMSUBADD213PDZmkz, 0 }, + { X86::VFMSUBADD213PHZ128rk, X86::VFMSUBADD213PHZ128mk, 0 }, + { X86::VFMSUBADD213PHZ128rkz, X86::VFMSUBADD213PHZ128mkz, 0 }, + { X86::VFMSUBADD213PHZ256rk, X86::VFMSUBADD213PHZ256mk, 0 }, + { X86::VFMSUBADD213PHZ256rkz, X86::VFMSUBADD213PHZ256mkz, 0 }, + { X86::VFMSUBADD213PHZrk, X86::VFMSUBADD213PHZmk, 0 }, + { X86::VFMSUBADD213PHZrkz, X86::VFMSUBADD213PHZmkz, 0 }, { X86::VFMSUBADD213PSZ128rk, X86::VFMSUBADD213PSZ128mk, 0 }, { X86::VFMSUBADD213PSZ128rkz, X86::VFMSUBADD213PSZ128mkz, 0 }, { X86::VFMSUBADD213PSZ256rk, X86::VFMSUBADD213PSZ256mk, 0 }, @@ -4755,6 +4911,12 @@ { X86::VFMSUBADD231PDZ256rkz, X86::VFMSUBADD231PDZ256mkz, 0 }, { X86::VFMSUBADD231PDZrk, X86::VFMSUBADD231PDZmk, 0 }, { X86::VFMSUBADD231PDZrkz, X86::VFMSUBADD231PDZmkz, 0 }, + { 
X86::VFMSUBADD231PHZ128rk, X86::VFMSUBADD231PHZ128mk, 0 }, + { X86::VFMSUBADD231PHZ128rkz, X86::VFMSUBADD231PHZ128mkz, 0 }, + { X86::VFMSUBADD231PHZ256rk, X86::VFMSUBADD231PHZ256mk, 0 }, + { X86::VFMSUBADD231PHZ256rkz, X86::VFMSUBADD231PHZ256mkz, 0 }, + { X86::VFMSUBADD231PHZrk, X86::VFMSUBADD231PHZmk, 0 }, + { X86::VFMSUBADD231PHZrkz, X86::VFMSUBADD231PHZmkz, 0 }, { X86::VFMSUBADD231PSZ128rk, X86::VFMSUBADD231PSZ128mk, 0 }, { X86::VFMSUBADD231PSZ128rkz, X86::VFMSUBADD231PSZ128mkz, 0 }, { X86::VFMSUBADD231PSZ256rk, X86::VFMSUBADD231PSZ256mk, 0 }, @@ -4767,6 +4929,12 @@ { X86::VFNMADD132PDZ256rkz, X86::VFNMADD132PDZ256mkz, 0 }, { X86::VFNMADD132PDZrk, X86::VFNMADD132PDZmk, 0 }, { X86::VFNMADD132PDZrkz, X86::VFNMADD132PDZmkz, 0 }, + { X86::VFNMADD132PHZ128rk, X86::VFNMADD132PHZ128mk, 0 }, + { X86::VFNMADD132PHZ128rkz, X86::VFNMADD132PHZ128mkz, 0 }, + { X86::VFNMADD132PHZ256rk, X86::VFNMADD132PHZ256mk, 0 }, + { X86::VFNMADD132PHZ256rkz, X86::VFNMADD132PHZ256mkz, 0 }, + { X86::VFNMADD132PHZrk, X86::VFNMADD132PHZmk, 0 }, + { X86::VFNMADD132PHZrkz, X86::VFNMADD132PHZmkz, 0 }, { X86::VFNMADD132PSZ128rk, X86::VFNMADD132PSZ128mk, 0 }, { X86::VFNMADD132PSZ128rkz, X86::VFNMADD132PSZ128mkz, 0 }, { X86::VFNMADD132PSZ256rk, X86::VFNMADD132PSZ256mk, 0 }, @@ -4775,6 +4943,8 @@ { X86::VFNMADD132PSZrkz, X86::VFNMADD132PSZmkz, 0 }, { X86::VFNMADD132SDZr_Intk, X86::VFNMADD132SDZm_Intk, TB_NO_REVERSE }, { X86::VFNMADD132SDZr_Intkz, X86::VFNMADD132SDZm_Intkz, TB_NO_REVERSE }, + { X86::VFNMADD132SHZr_Intk, X86::VFNMADD132SHZm_Intk, TB_NO_REVERSE }, + { X86::VFNMADD132SHZr_Intkz, X86::VFNMADD132SHZm_Intkz, TB_NO_REVERSE }, { X86::VFNMADD132SSZr_Intk, X86::VFNMADD132SSZm_Intk, TB_NO_REVERSE }, { X86::VFNMADD132SSZr_Intkz, X86::VFNMADD132SSZm_Intkz, TB_NO_REVERSE }, { X86::VFNMADD213PDZ128rk, X86::VFNMADD213PDZ128mk, 0 }, @@ -4783,6 +4953,12 @@ { X86::VFNMADD213PDZ256rkz, X86::VFNMADD213PDZ256mkz, 0 }, { X86::VFNMADD213PDZrk, X86::VFNMADD213PDZmk, 0 }, { X86::VFNMADD213PDZrkz, X86::VFNMADD213PDZmkz, 0 }, + { X86::VFNMADD213PHZ128rk, X86::VFNMADD213PHZ128mk, 0 }, + { X86::VFNMADD213PHZ128rkz, X86::VFNMADD213PHZ128mkz, 0 }, + { X86::VFNMADD213PHZ256rk, X86::VFNMADD213PHZ256mk, 0 }, + { X86::VFNMADD213PHZ256rkz, X86::VFNMADD213PHZ256mkz, 0 }, + { X86::VFNMADD213PHZrk, X86::VFNMADD213PHZmk, 0 }, + { X86::VFNMADD213PHZrkz, X86::VFNMADD213PHZmkz, 0 }, { X86::VFNMADD213PSZ128rk, X86::VFNMADD213PSZ128mk, 0 }, { X86::VFNMADD213PSZ128rkz, X86::VFNMADD213PSZ128mkz, 0 }, { X86::VFNMADD213PSZ256rk, X86::VFNMADD213PSZ256mk, 0 }, @@ -4791,6 +4967,8 @@ { X86::VFNMADD213PSZrkz, X86::VFNMADD213PSZmkz, 0 }, { X86::VFNMADD213SDZr_Intk, X86::VFNMADD213SDZm_Intk, TB_NO_REVERSE }, { X86::VFNMADD213SDZr_Intkz, X86::VFNMADD213SDZm_Intkz, TB_NO_REVERSE }, + { X86::VFNMADD213SHZr_Intk, X86::VFNMADD213SHZm_Intk, TB_NO_REVERSE }, + { X86::VFNMADD213SHZr_Intkz, X86::VFNMADD213SHZm_Intkz, TB_NO_REVERSE }, { X86::VFNMADD213SSZr_Intk, X86::VFNMADD213SSZm_Intk, TB_NO_REVERSE }, { X86::VFNMADD213SSZr_Intkz, X86::VFNMADD213SSZm_Intkz, TB_NO_REVERSE }, { X86::VFNMADD231PDZ128rk, X86::VFNMADD231PDZ128mk, 0 }, @@ -4799,6 +4977,12 @@ { X86::VFNMADD231PDZ256rkz, X86::VFNMADD231PDZ256mkz, 0 }, { X86::VFNMADD231PDZrk, X86::VFNMADD231PDZmk, 0 }, { X86::VFNMADD231PDZrkz, X86::VFNMADD231PDZmkz, 0 }, + { X86::VFNMADD231PHZ128rk, X86::VFNMADD231PHZ128mk, 0 }, + { X86::VFNMADD231PHZ128rkz, X86::VFNMADD231PHZ128mkz, 0 }, + { X86::VFNMADD231PHZ256rk, X86::VFNMADD231PHZ256mk, 0 }, + { X86::VFNMADD231PHZ256rkz, X86::VFNMADD231PHZ256mkz, 0 }, + { X86::VFNMADD231PHZrk, 
X86::VFNMADD231PHZmk, 0 }, + { X86::VFNMADD231PHZrkz, X86::VFNMADD231PHZmkz, 0 }, { X86::VFNMADD231PSZ128rk, X86::VFNMADD231PSZ128mk, 0 }, { X86::VFNMADD231PSZ128rkz, X86::VFNMADD231PSZ128mkz, 0 }, { X86::VFNMADD231PSZ256rk, X86::VFNMADD231PSZ256mk, 0 }, @@ -4807,6 +4991,8 @@ { X86::VFNMADD231PSZrkz, X86::VFNMADD231PSZmkz, 0 }, { X86::VFNMADD231SDZr_Intk, X86::VFNMADD231SDZm_Intk, TB_NO_REVERSE }, { X86::VFNMADD231SDZr_Intkz, X86::VFNMADD231SDZm_Intkz, TB_NO_REVERSE }, + { X86::VFNMADD231SHZr_Intk, X86::VFNMADD231SHZm_Intk, TB_NO_REVERSE }, + { X86::VFNMADD231SHZr_Intkz, X86::VFNMADD231SHZm_Intkz, TB_NO_REVERSE }, { X86::VFNMADD231SSZr_Intk, X86::VFNMADD231SSZm_Intk, TB_NO_REVERSE }, { X86::VFNMADD231SSZr_Intkz, X86::VFNMADD231SSZm_Intkz, TB_NO_REVERSE }, { X86::VFNMSUB132PDZ128rk, X86::VFNMSUB132PDZ128mk, 0 }, @@ -4815,6 +5001,12 @@ { X86::VFNMSUB132PDZ256rkz, X86::VFNMSUB132PDZ256mkz, 0 }, { X86::VFNMSUB132PDZrk, X86::VFNMSUB132PDZmk, 0 }, { X86::VFNMSUB132PDZrkz, X86::VFNMSUB132PDZmkz, 0 }, + { X86::VFNMSUB132PHZ128rk, X86::VFNMSUB132PHZ128mk, 0 }, + { X86::VFNMSUB132PHZ128rkz, X86::VFNMSUB132PHZ128mkz, 0 }, + { X86::VFNMSUB132PHZ256rk, X86::VFNMSUB132PHZ256mk, 0 }, + { X86::VFNMSUB132PHZ256rkz, X86::VFNMSUB132PHZ256mkz, 0 }, + { X86::VFNMSUB132PHZrk, X86::VFNMSUB132PHZmk, 0 }, + { X86::VFNMSUB132PHZrkz, X86::VFNMSUB132PHZmkz, 0 }, { X86::VFNMSUB132PSZ128rk, X86::VFNMSUB132PSZ128mk, 0 }, { X86::VFNMSUB132PSZ128rkz, X86::VFNMSUB132PSZ128mkz, 0 }, { X86::VFNMSUB132PSZ256rk, X86::VFNMSUB132PSZ256mk, 0 }, @@ -4823,6 +5015,8 @@ { X86::VFNMSUB132PSZrkz, X86::VFNMSUB132PSZmkz, 0 }, { X86::VFNMSUB132SDZr_Intk, X86::VFNMSUB132SDZm_Intk, TB_NO_REVERSE }, { X86::VFNMSUB132SDZr_Intkz, X86::VFNMSUB132SDZm_Intkz, TB_NO_REVERSE }, + { X86::VFNMSUB132SHZr_Intk, X86::VFNMSUB132SHZm_Intk, TB_NO_REVERSE }, + { X86::VFNMSUB132SHZr_Intkz, X86::VFNMSUB132SHZm_Intkz, TB_NO_REVERSE }, { X86::VFNMSUB132SSZr_Intk, X86::VFNMSUB132SSZm_Intk, TB_NO_REVERSE }, { X86::VFNMSUB132SSZr_Intkz, X86::VFNMSUB132SSZm_Intkz, TB_NO_REVERSE }, { X86::VFNMSUB213PDZ128rk, X86::VFNMSUB213PDZ128mk, 0 }, @@ -4831,6 +5025,12 @@ { X86::VFNMSUB213PDZ256rkz, X86::VFNMSUB213PDZ256mkz, 0 }, { X86::VFNMSUB213PDZrk, X86::VFNMSUB213PDZmk, 0 }, { X86::VFNMSUB213PDZrkz, X86::VFNMSUB213PDZmkz, 0 }, + { X86::VFNMSUB213PHZ128rk, X86::VFNMSUB213PHZ128mk, 0 }, + { X86::VFNMSUB213PHZ128rkz, X86::VFNMSUB213PHZ128mkz, 0 }, + { X86::VFNMSUB213PHZ256rk, X86::VFNMSUB213PHZ256mk, 0 }, + { X86::VFNMSUB213PHZ256rkz, X86::VFNMSUB213PHZ256mkz, 0 }, + { X86::VFNMSUB213PHZrk, X86::VFNMSUB213PHZmk, 0 }, + { X86::VFNMSUB213PHZrkz, X86::VFNMSUB213PHZmkz, 0 }, { X86::VFNMSUB213PSZ128rk, X86::VFNMSUB213PSZ128mk, 0 }, { X86::VFNMSUB213PSZ128rkz, X86::VFNMSUB213PSZ128mkz, 0 }, { X86::VFNMSUB213PSZ256rk, X86::VFNMSUB213PSZ256mk, 0 }, @@ -4839,6 +5039,8 @@ { X86::VFNMSUB213PSZrkz, X86::VFNMSUB213PSZmkz, 0 }, { X86::VFNMSUB213SDZr_Intk, X86::VFNMSUB213SDZm_Intk, TB_NO_REVERSE }, { X86::VFNMSUB213SDZr_Intkz, X86::VFNMSUB213SDZm_Intkz, TB_NO_REVERSE }, + { X86::VFNMSUB213SHZr_Intk, X86::VFNMSUB213SHZm_Intk, TB_NO_REVERSE }, + { X86::VFNMSUB213SHZr_Intkz, X86::VFNMSUB213SHZm_Intkz, TB_NO_REVERSE }, { X86::VFNMSUB213SSZr_Intk, X86::VFNMSUB213SSZm_Intk, TB_NO_REVERSE }, { X86::VFNMSUB213SSZr_Intkz, X86::VFNMSUB213SSZm_Intkz, TB_NO_REVERSE }, { X86::VFNMSUB231PDZ128rk, X86::VFNMSUB231PDZ128mk, 0 }, @@ -4847,6 +5049,12 @@ { X86::VFNMSUB231PDZ256rkz, X86::VFNMSUB231PDZ256mkz, 0 }, { X86::VFNMSUB231PDZrk, X86::VFNMSUB231PDZmk, 0 }, { X86::VFNMSUB231PDZrkz, X86::VFNMSUB231PDZmkz, 
0 },
+  { X86::VFNMSUB231PHZ128rk, X86::VFNMSUB231PHZ128mk, 0 },
+  { X86::VFNMSUB231PHZ128rkz, X86::VFNMSUB231PHZ128mkz, 0 },
+  { X86::VFNMSUB231PHZ256rk, X86::VFNMSUB231PHZ256mk, 0 },
+  { X86::VFNMSUB231PHZ256rkz, X86::VFNMSUB231PHZ256mkz, 0 },
+  { X86::VFNMSUB231PHZrk, X86::VFNMSUB231PHZmk, 0 },
+  { X86::VFNMSUB231PHZrkz, X86::VFNMSUB231PHZmkz, 0 },
   { X86::VFNMSUB231PSZ128rk, X86::VFNMSUB231PSZ128mk, 0 },
   { X86::VFNMSUB231PSZ128rkz, X86::VFNMSUB231PSZ128mkz, 0 },
   { X86::VFNMSUB231PSZ256rk, X86::VFNMSUB231PSZ256mk, 0 },
@@ -4855,6 +5063,8 @@
   { X86::VFNMSUB231PSZrkz, X86::VFNMSUB231PSZmkz, 0 },
   { X86::VFNMSUB231SDZr_Intk, X86::VFNMSUB231SDZm_Intk, TB_NO_REVERSE },
   { X86::VFNMSUB231SDZr_Intkz, X86::VFNMSUB231SDZm_Intkz, TB_NO_REVERSE },
+  { X86::VFNMSUB231SHZr_Intk, X86::VFNMSUB231SHZm_Intk, TB_NO_REVERSE },
+  { X86::VFNMSUB231SHZr_Intkz, X86::VFNMSUB231SHZm_Intkz, TB_NO_REVERSE },
   { X86::VFNMSUB231SSZr_Intk, X86::VFNMSUB231SSZm_Intk, TB_NO_REVERSE },
   { X86::VFNMSUB231SSZr_Intkz, X86::VFNMSUB231SSZm_Intkz, TB_NO_REVERSE },
   { X86::VGETEXPSDZrk, X86::VGETEXPSDZmk, TB_NO_REVERSE },
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -882,7 +882,6 @@
   listpattern> : I, T8PD, EVEX_4V, Requires<[HasAVX512]>;
-class AVX512FMA3Base : T8PD, EVEX_4V;
 class AVX512 o, Format F, dag outs, dag ins, string asm, listpattern>
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6137,6 +6137,24 @@
   case X86::VMINSHZrr_Intk: case X86::VMINSHZrr_Intkz:
   case X86::VMULSHZrr_Intk: case X86::VMULSHZrr_Intkz:
   case X86::VSUBSHZrr_Intk: case X86::VSUBSHZrr_Intkz:
+  case X86::VFMADD132SHZr_Int: case X86::VFNMADD132SHZr_Int:
+  case X86::VFMADD213SHZr_Int: case X86::VFNMADD213SHZr_Int:
+  case X86::VFMADD231SHZr_Int: case X86::VFNMADD231SHZr_Int:
+  case X86::VFMSUB132SHZr_Int: case X86::VFNMSUB132SHZr_Int:
+  case X86::VFMSUB213SHZr_Int: case X86::VFNMSUB213SHZr_Int:
+  case X86::VFMSUB231SHZr_Int: case X86::VFNMSUB231SHZr_Int:
+  case X86::VFMADD132SHZr_Intk: case X86::VFNMADD132SHZr_Intk:
+  case X86::VFMADD213SHZr_Intk: case X86::VFNMADD213SHZr_Intk:
+  case X86::VFMADD231SHZr_Intk: case X86::VFNMADD231SHZr_Intk:
+  case X86::VFMSUB132SHZr_Intk: case X86::VFNMSUB132SHZr_Intk:
+  case X86::VFMSUB213SHZr_Intk: case X86::VFNMSUB213SHZr_Intk:
+  case X86::VFMSUB231SHZr_Intk: case X86::VFNMSUB231SHZr_Intk:
+  case X86::VFMADD132SHZr_Intkz: case X86::VFNMADD132SHZr_Intkz:
+  case X86::VFMADD213SHZr_Intkz: case X86::VFNMADD213SHZr_Intkz:
+  case X86::VFMADD231SHZr_Intkz: case X86::VFNMADD231SHZr_Intkz:
+  case X86::VFMSUB132SHZr_Intkz: case X86::VFNMSUB132SHZr_Intkz:
+  case X86::VFMSUB213SHZr_Intkz: case X86::VFNMSUB213SHZr_Intkz:
+  case X86::VFMSUB231SHZr_Intkz: case X86::VFNMSUB231SHZr_Intkz:
     return false;
   default: return true;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1187,6 +1187,12 @@
                      X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
   X86_INTRINSIC_DATA(avx512fp16_vcvtusi642sh, INTR_TYPE_2OP,
                      X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
+  X86_INTRINSIC_DATA(avx512fp16_vfmadd_f16, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+  X86_INTRINSIC_DATA(avx512fp16_vfmadd_ph_512, INTR_TYPE_3OP, ISD::FMA,
+                     X86ISD::FMADD_RND),
+  X86_INTRINSIC_DATA(avx512fp16_vfmaddsub_ph_128, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512fp16_vfmaddsub_ph_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512fp16_vfmaddsub_ph_512, INTR_TYPE_3OP, X86ISD::FMADDSUB,
+                     X86ISD::FMADDSUB_RND),
   X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
   X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
   X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fma-commute.ll b/llvm/test/CodeGen/X86/avx512fp16-fma-commute.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512fp16-fma-commute.ll
@@ -0,0 +1,1363 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s
+
+declare half @llvm.fma.f16(half, half, half)
+declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
+declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>)
+declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)
+
+define half @fma_123_f16(half %x, half %y, half %z) {
+; CHECK-LABEL: fma_123_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd213sh %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a = call half @llvm.fma.f16(half %x, half %y, half %z)
+  ret half %a
+}
+
+define half @fma_213_f16(half %x, half %y, half %z) {
+; CHECK-LABEL: fma_213_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd213sh %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a = call half @llvm.fma.f16(half %y, half %x, half %z)
+  ret half %a
+}
+
+define half @fma_231_f16(half %x, half %y, half %z) {
+; CHECK-LABEL: fma_231_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd231sh %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %a = call half @llvm.fma.f16(half %y, half %z, half %x)
+  ret half %a
+}
+
+define half @fma_321_f16(half %x, half %y, half %z) {
+; CHECK-LABEL: fma_321_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd231sh %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %a = call half @llvm.fma.f16(half %z, half %y, half %x)
+  ret half %a
+}
+
+define half @fma_132_f16(half %x, half %y, half %z) {
+; CHECK-LABEL: fma_132_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd213sh %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %a = call half @llvm.fma.f16(half %x, half %z, half %y)
+  ret half %a
+}
+
+define half @fma_312_f16(half %x, half %y, half %z) {
+; CHECK-LABEL: fma_312_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd213sh %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %a = call half @llvm.fma.f16(half %z, half %x, half %y)
+  ret half %a
+}
+
+define half @fma_load_123_f16(half %x, half %y, half* %zp) {
+; CHECK-LABEL: fma_load_123_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd213sh (%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %z = load half, half* %zp
+  %a = call half @llvm.fma.f16(half %x, half %y, half %z)
+  ret half %a
+}
+
+define half @fma_load_213_f16(half %x, half %y, half* %zp) {
+; CHECK-LABEL: fma_load_213_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd213sh (%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %z = load half, half* %zp
+  %a = call half @llvm.fma.f16(half %y, half %x, half %z)
+  ret half %a
+}
+
+define half @fma_load_231_f16(half %x, half %y, half* %zp) {
+; CHECK-LABEL: fma_load_231_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd231sh (%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %z = load half, half* %zp
+  %a = call half @llvm.fma.f16(half %y, half %z, half %x)
+  ret half %a
+}
+
+define half @fma_load_321_f16(half %x,
half %y, half* %zp) {
+; CHECK-LABEL: fma_load_321_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd231sh (%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %z = load half, half* %zp
+  %a = call half @llvm.fma.f16(half %z, half %y, half %x)
+  ret half %a
+}
+
+define half @fma_load_132_f16(half %x, half %y, half* %zp) {
+; CHECK-LABEL: fma_load_132_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd132sh (%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %z = load half, half* %zp
+  %a = call half @llvm.fma.f16(half %x, half %z, half %y)
+  ret half %a
+}
+
+define half @fma_load_312_f16(half %x, half %y, half* %zp) {
+; CHECK-LABEL: fma_load_312_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd132sh (%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %z = load half, half* %zp
+  %a = call half @llvm.fma.f16(half %z, half %x, half %y)
+  ret half %a
+}
+
+define <8 x half> @fma_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
+; CHECK-LABEL: fma_123_v8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd213ph %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
+  ret <8 x half> %a
+}
+
+define <8 x half> @fma_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
+; CHECK-LABEL: fma_213_v8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd213ph %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
+  ret <8 x half> %a
+}
+
+define <8 x half> @fma_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
+; CHECK-LABEL: fma_231_v8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd231ph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x)
+  ret <8 x half> %a
+}
+
+define <8 x half> @fma_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
+; CHECK-LABEL: fma_321_v8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd231ph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x)
+  ret <8 x half> %a
+}
+
+define <8 x half> @fma_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
+; CHECK-LABEL: fma_132_v8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd213ph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y)
+  ret <8 x half> %a
+}
+
+define <8 x half> @fma_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) {
+; CHECK-LABEL: fma_312_v8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd213ph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y)
+  ret <8 x half> %a
+}
+
+define <8 x half> @fma_load_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp) {
+; CHECK-LABEL: fma_load_123_v8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd213ph (%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %z = load <8 x half>, <8 x half>* %zp
+  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z)
+  ret <8 x half> %a
+}
+
+define <8 x half> @fma_load_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp) {
+; CHECK-LABEL: fma_load_213_v8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vfmadd213ph (%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %z = load <8 x half>, <8 x half>* %zp
+  %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z)
+  ret <8 x half> %a
+}
+
+define <8 x half> @fma_load_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp) {
+; CHECK-LABEL: fma_load_231_v8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:
vfmadd231ph (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x) + ret <8 x half> %a +} + +define <8 x half> @fma_load_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp) { +; CHECK-LABEL: fma_load_321_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231ph (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x) + ret <8 x half> %a +} + +define <8 x half> @fma_load_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp) { +; CHECK-LABEL: fma_load_132_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132ph (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y) + ret <8 x half> %a +} + +define <8 x half> @fma_load_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp) { +; CHECK-LABEL: fma_load_312_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132ph (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y) + ret <8 x half> %a +} + +define <8 x half> @fma_mask_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_123_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132ph %xmm1, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x + ret <8 x half> %c +} + +define <8 x half> @fma_mask_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_213_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x + ret <8 x half> %c +} + +define <8 x half> @fma_mask_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_231_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231ph %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x + ret <8 x half> %c +} + +define <8 x half> @fma_mask_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_321_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231ph %xmm1, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x + ret <8 x half> %c +} + +define <8 x half> @fma_mask_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_132_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132ph %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x + ret <8 x half> 
%c +} + +define <8 x half> @fma_mask_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) { +; CHECK-LABEL: fma_mask_312_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %xmm1, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x + ret <8 x half> %c +} + +define <8 x half> @fma_maskz_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_123_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @fma_maskz_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_213_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @fma_maskz_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_231_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231ph %xmm1, %xmm2, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @fma_maskz_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_321_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231ph %xmm1, %xmm2, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @fma_maskz_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_132_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %xmm1, %xmm2, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @fma_maskz_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z, i8 %mask) { +; CHECK-LABEL: fma_maskz_312_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %xmm1, %xmm2, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @fma_mask_load_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_123_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213ph (%rdi), %xmm1, 
%xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x + ret <8 x half> %c +} + +define <8 x half> @fma_mask_load_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_213_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213ph (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x + ret <8 x half> %c +} + +define <8 x half> @fma_mask_load_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_231_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231ph (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x + ret <8 x half> %c +} + +define <8 x half> @fma_mask_load_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_321_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231ph (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x + ret <8 x half> %c +} + +define <8 x half> @fma_mask_load_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_132_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ph (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x + ret <8 x half> %c +} + +define <8 x half> @fma_mask_load_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) { +; CHECK-LABEL: fma_mask_load_312_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ph (%rdi), %xmm1, %xmm0 {%k1} +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> %x + ret <8 x half> %c +} + +define <8 x half> @fma_maskz_load_123_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_123_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213ph (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %y, <8 x half> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @fma_maskz_load_213_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_213_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; 
CHECK-NEXT: vfmadd213ph (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %x, <8 x half> %z) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @fma_maskz_load_231_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_231_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231ph (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @fma_maskz_load_321_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_321_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231ph (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %y, <8 x half> %x) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @fma_maskz_load_132_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_132_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ph (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %x, <8 x half> %z, <8 x half> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @fma_maskz_load_312_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %zp, i8 %mask) { +; CHECK-LABEL: fma_maskz_load_312_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ph (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <8 x half>, <8 x half>* %zp + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %z, <8 x half> %x, <8 x half> %y) + %b = bitcast i8 %mask to <8 x i1> + %c = select <8 x i1> %b, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <16 x half> @fma_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) { +; CHECK-LABEL: fma_123_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) + ret <16 x half> %a +} + +define <16 x half> @fma_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) { +; CHECK-LABEL: fma_213_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z) + ret <16 x half> %a +} + +define <16 x half> @fma_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) { +; CHECK-LABEL: fma_231_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231ph %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x) + ret <16 x half> %a +} + +define <16 x half> @fma_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) { +; CHECK-LABEL: fma_321_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vfmadd231ph %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x) + ret <16 x half> %a +} + +define <16 x half> @fma_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) { +; CHECK-LABEL: fma_132_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y) + ret <16 x half> %a +} + +define <16 x half> @fma_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) { +; CHECK-LABEL: fma_312_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y) + ret <16 x half> %a +} + +define <16 x half> @fma_load_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp) { +; CHECK-LABEL: fma_load_123_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) + ret <16 x half> %a +} + +define <16 x half> @fma_load_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp) { +; CHECK-LABEL: fma_load_213_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z) + ret <16 x half> %a +} + +define <16 x half> @fma_load_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp) { +; CHECK-LABEL: fma_load_231_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231ph (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x) + ret <16 x half> %a +} + +define <16 x half> @fma_load_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp) { +; CHECK-LABEL: fma_load_321_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231ph (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x) + ret <16 x half> %a +} + +define <16 x half> @fma_load_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp) { +; CHECK-LABEL: fma_load_132_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132ph (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y) + ret <16 x half> %a +} + +define <16 x half> @fma_load_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp) { +; CHECK-LABEL: fma_load_312_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132ph (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y) + ret <16 x half> %a +} + +define <16 x half> @fma_mask_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_123_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132ph %ymm1, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x + ret <16 x half> %c +} + +define <16 x half> 
@fma_mask_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_213_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x + ret <16 x half> %c +} + +define <16 x half> @fma_mask_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_231_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231ph %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x + ret <16 x half> %c +} + +define <16 x half> @fma_mask_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_321_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231ph %ymm1, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x + ret <16 x half> %c +} + +define <16 x half> @fma_mask_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_132_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132ph %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x + ret <16 x half> %c +} + +define <16 x half> @fma_mask_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) { +; CHECK-LABEL: fma_mask_312_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %ymm1, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x + ret <16 x half> %c +} + +define <16 x half> @fma_maskz_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_123_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %c +} + +define <16 x half> @fma_maskz_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_213_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %c +} + +define <16 x half> @fma_maskz_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_231_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231ph 
%ymm1, %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %c +} + +define <16 x half> @fma_maskz_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_321_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231ph %ymm1, %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %c +} + +define <16 x half> @fma_maskz_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_132_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %ymm1, %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %c +} + +define <16 x half> @fma_maskz_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z, i16 %mask) { +; CHECK-LABEL: fma_maskz_312_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %ymm1, %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %c +} + +define <16 x half> @fma_mask_load_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_123_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213ph (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x + ret <16 x half> %c +} + +define <16 x half> @fma_mask_load_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_213_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213ph (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x + ret <16 x half> %c +} + +define <16 x half> @fma_mask_load_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_231_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231ph (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x + ret <16 x half> %c +} + +define <16 x half> @fma_mask_load_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_321_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; 
CHECK-NEXT: vfmadd231ph (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x + ret <16 x half> %c +} + +define <16 x half> @fma_mask_load_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_132_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ph (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x + ret <16 x half> %c +} + +define <16 x half> @fma_mask_load_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) { +; CHECK-LABEL: fma_mask_load_312_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ph (%rdi), %ymm1, %ymm0 {%k1} +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> %x + ret <16 x half> %c +} + +define <16 x half> @fma_maskz_load_123_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_123_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213ph (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %y, <16 x half> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %c +} + +define <16 x half> @fma_maskz_load_213_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_213_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213ph (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %x, <16 x half> %z) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %c +} + +define <16 x half> @fma_maskz_load_231_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_231_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231ph (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %y, <16 x half> %z, <16 x half> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %c +} + +define <16 x half> @fma_maskz_load_321_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_321_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231ph (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %y, <16 x half> %x) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer + ret 
<16 x half> %c +} + +define <16 x half> @fma_maskz_load_132_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_132_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ph (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %x, <16 x half> %z, <16 x half> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %c +} + +define <16 x half> @fma_maskz_load_312_v16f16(<16 x half> %x, <16 x half> %y, <16 x half>* %zp, i16 %mask) { +; CHECK-LABEL: fma_maskz_load_312_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ph (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <16 x half>, <16 x half>* %zp + %a = call <16 x half> @llvm.fma.v16f16(<16 x half> %z, <16 x half> %x, <16 x half> %y) + %b = bitcast i16 %mask to <16 x i1> + %c = select <16 x i1> %b, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %c +} + +define <32 x half> @fma_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) { +; CHECK-LABEL: fma_123_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) + ret <32 x half> %a +} + +define <32 x half> @fma_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) { +; CHECK-LABEL: fma_213_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z) + ret <32 x half> %a +} + +define <32 x half> @fma_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) { +; CHECK-LABEL: fma_231_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231ph %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x) + ret <32 x half> %a +} + +define <32 x half> @fma_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) { +; CHECK-LABEL: fma_321_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231ph %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x) + ret <32 x half> %a +} + +define <32 x half> @fma_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) { +; CHECK-LABEL: fma_132_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y) + ret <32 x half> %a +} + +define <32 x half> @fma_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) { +; CHECK-LABEL: fma_312_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y) + ret <32 x half> %a +} + +define <32 x half> @fma_load_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp) { +; CHECK-LABEL: fma_load_123_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) + ret <32 x half> %a +} + +define <32 x half> @fma_load_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp) { +; CHECK-LABEL: fma_load_213_v32f16: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vfmadd213ph (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z) + ret <32 x half> %a +} + +define <32 x half> @fma_load_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp) { +; CHECK-LABEL: fma_load_231_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231ph (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x) + ret <32 x half> %a +} + +define <32 x half> @fma_load_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp) { +; CHECK-LABEL: fma_load_321_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd231ph (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x) + ret <32 x half> %a +} + +define <32 x half> @fma_load_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp) { +; CHECK-LABEL: fma_load_132_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132ph (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y) + ret <32 x half> %a +} + +define <32 x half> @fma_load_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp) { +; CHECK-LABEL: fma_load_312_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd132ph (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y) + ret <32 x half> %a +} + +define <32 x half> @fma_mask_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_123_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132ph %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x + ret <32 x half> %c +} + +define <32 x half> @fma_mask_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_213_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x + ret <32 x half> %c +} + +define <32 x half> @fma_mask_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_231_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231ph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x + ret <32 x half> %c +} + +define <32 x half> @fma_mask_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_321_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231ph %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x 
i1> %b, <32 x half> %a, <32 x half> %x + ret <32 x half> %c +} + +define <32 x half> @fma_mask_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_132_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd132ph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x + ret <32 x half> %c +} + +define <32 x half> @fma_mask_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) { +; CHECK-LABEL: fma_mask_312_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %zmm1, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x + ret <32 x half> %c +} + +define <32 x half> @fma_maskz_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_123_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %c +} + +define <32 x half> @fma_maskz_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_213_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %c +} + +define <32 x half> @fma_maskz_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_231_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231ph %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %c +} + +define <32 x half> @fma_maskz_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_321_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd231ph %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %c +} + +define <32 x half> @fma_maskz_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_132_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %c +} + +define <32 x half> @fma_maskz_312_v32f16(<32 x half> %x, <32 x half> %y, 
<32 x half> %z, i32 %mask) { +; CHECK-LABEL: fma_maskz_312_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmadd213ph %zmm1, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %c +} + +define <32 x half> @fma_mask_load_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_123_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213ph (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x + ret <32 x half> %c +} + +define <32 x half> @fma_mask_load_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_213_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213ph (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x + ret <32 x half> %c +} + +define <32 x half> @fma_mask_load_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_231_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231ph (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x + ret <32 x half> %c +} + +define <32 x half> @fma_mask_load_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_321_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231ph (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x + ret <32 x half> %c +} + +define <32 x half> @fma_mask_load_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_132_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ph (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> %x + ret <32 x half> %c +} + +define <32 x half> @fma_mask_load_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) { +; CHECK-LABEL: fma_mask_load_312_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ph (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, 
<32 x half> %x + ret <32 x half> %c +} + +define <32 x half> @fma_maskz_load_123_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_123_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213ph (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %y, <32 x half> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %c +} + +define <32 x half> @fma_maskz_load_213_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_213_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd213ph (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %x, <32 x half> %z) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %c +} + +define <32 x half> @fma_maskz_load_231_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_231_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231ph (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %y, <32 x half> %z, <32 x half> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %c +} + +define <32 x half> @fma_maskz_load_321_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_321_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd231ph (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %y, <32 x half> %x) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %c +} + +define <32 x half> @fma_maskz_load_132_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_132_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ph (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %x, <32 x half> %z, <32 x half> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %c +} + +define <32 x half> @fma_maskz_load_312_v32f16(<32 x half> %x, <32 x half> %y, <32 x half>* %zp, i32 %mask) { +; CHECK-LABEL: fma_maskz_load_312_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vfmadd132ph (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %z = load <32 x half>, <32 x half>* %zp + %a = call <32 x half> @llvm.fma.v32f16(<32 x half> %z, <32 x half> %x, <32 x half> %y) + %b = bitcast i32 %mask to <32 x i1> + %c = select <32 x i1> %b, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %c +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll @@ -0,0 +1,585 @@ 
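+; Coverage note for the new test file below (descriptive comment only): it exercises
+; AVX512-FP16 FMA lowering for llvm.fma.v32f16 (including the fneg-based vfnmadd/vfnmsub
+; forms), the rounding intrinsics llvm.x86.avx512fp16.vfmadd.ph.512 and
+; llvm.x86.avx512fp16.vfmaddsub.ph.512 with mask, maskz and mask3 selects, and the scalar
+; llvm.fma.f16 / llvm.x86.avx512fp16.vfmadd.f16 patterns with memory-folded operands.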
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64 + + +declare <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half>, <32 x half>, <32 x half>, i32) + +define <32 x half> @test_x86_vfnmadd_ph_z(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { +; CHECK-LABEL: test_x86_vfnmadd_ph_z: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmadd213ph %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x48,0xac,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fsub <32 x half> , %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %1, <32 x half> %a2) + ret <32 x half> %2 +} + +define <32 x half> @test_mask_vfnmadd_ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) { +; X86-LABEL: test_mask_vfnmadd_ph: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_vfnmadd_ph: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <32 x half> , %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %1, <32 x half> %a2) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @test_x86_vfnmsubph_z(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { +; CHECK-LABEL: test_x86_vfnmsubph_z: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmsub213ph %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x48,0xae,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fsub <32 x half> , %a1 + %2 = fsub <32 x half> , %a2 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %1, <32 x half> %2) + ret <32 x half> %3 +} + +define <32 x half> @test_mask_vfnmsub_ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) { +; X86-LABEL: test_mask_vfnmsub_ph: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_vfnmsub_ph: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <32 x half> , %a1 + %2 = fsub <32 x half> , %a2 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %1, <32 x half> %2) + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> %a0 + ret <32 x half> %5 +} + +define <32 x half> @test_x86_vfmaddsubph_z(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { +; CHECK-LABEL: test_x86_vfmaddsubph_z: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmaddsub213ph %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x48,0xa6,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4) 
#2 + ret <32 x half> %res +} + +define <32 x half> @test_mask_fmaddsub_ph(<32 x half> %a, <32 x half> %b, <32 x half> %c, i32 %mask) { +; X86-LABEL: test_mask_fmaddsub_ph: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmaddsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x96,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_fmaddsub_ph: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmaddsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x96,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a, <32 x half> %b, <32 x half> %c, i32 4) + %bc = bitcast i32 %mask to <32 x i1> + %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %a + ret <32 x half> %sel +} + +declare <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half>, <32 x half>, <32 x half>, i32) nounwind readnone + +define <32 x half>@test_int_x86_avx512_mask_vfmaddsub_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){ +; X86-LABEL: test_int_x86_avx512_mask_vfmaddsub_ph_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmaddsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x96,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vfmaddsub_ph_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmaddsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x96,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 4) + %bc = bitcast i32 %x3 to <32 x i1> + %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %x0 + ret <32 x half> %sel +} + +define <32 x half>@test_int_x86_avx512_mask3_vfmaddsub_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ph_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmaddsub231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xb6,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ph_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmaddsub231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xb6,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 4) + %bc = bitcast i32 %x3 to <32 x i1> + %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %x2 + ret <32 x half> %sel +} + +define <32 x half>@test_int_x86_avx512_maskz_vfmaddsub_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){ +; X86-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ph_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmaddsub213ph %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xc9,0xa6,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: 
test_int_x86_avx512_maskz_vfmaddsub_ph_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmaddsub213ph %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xc9,0xa6,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 4) + %bc = bitcast i32 %x3 to <32 x i1> + %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> zeroinitializer + ret <32 x half> %sel +} + +define <32 x half>@test_int_x86_avx512_mask3_vfmsubadd_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ph_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsubadd231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xb7,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ph_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsubadd231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xb7,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %neg = fneg <32 x half> %x2 + %res = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> %neg, i32 4) + %bc = bitcast i32 %x3 to <32 x i1> + %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %x2 + ret <32 x half> %sel +} + +define <32 x half> @test_mask_round_vfmadd512_ph_rrb_rne(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) { +; X86-LABEL: test_mask_round_vfmadd512_ph_rrb_rne: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd132ph {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x19,0x98,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_round_vfmadd512_ph_rrb_rne: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd132ph {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x19,0x98,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 8) nounwind + %bc = bitcast i32 %mask to <32 x i1> + %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %a0 + ret <32 x half> %sel +} + +define <32 x half> @test_mask_round_vfmadd512_ph_rrb_rtn(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) { +; X86-LABEL: test_mask_round_vfmadd512_ph_rrb_rtn: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd132ph {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x39,0x98,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_round_vfmadd512_ph_rrb_rtn: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd132ph {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x39,0x98,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 9) nounwind + %bc = bitcast i32 %mask to <32 x i1> + %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> 
%a0 + ret <32 x half> %sel +} + +define <32 x half> @test_mask_round_vfmadd512_ph_rrb_rtp(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) { +; X86-LABEL: test_mask_round_vfmadd512_ph_rrb_rtp: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd132ph {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x59,0x98,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_round_vfmadd512_ph_rrb_rtp: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd132ph {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x59,0x98,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 10) nounwind + %bc = bitcast i32 %mask to <32 x i1> + %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %a0 + ret <32 x half> %sel +} + +define <32 x half> @test_mask_round_vfmadd512_ph_rrb_rtz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) { +; X86-LABEL: test_mask_round_vfmadd512_ph_rrb_rtz: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd132ph {rz-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x79,0x98,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_round_vfmadd512_ph_rrb_rtz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd132ph {rz-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x79,0x98,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 11) nounwind + %bc = bitcast i32 %mask to <32 x i1> + %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %a0 + ret <32 x half> %sel +} + +define <32 x half> @test_mask_round_vfmadd512_ph_rrb_current(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 %mask) { +; X86-LABEL: test_mask_round_vfmadd512_ph_rrb_current: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x98,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_round_vfmadd512_ph_rrb_current: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x98,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4) nounwind + %bc = bitcast i32 %mask to <32 x i1> + %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %a0 + ret <32 x half> %sel +} + +define <32 x half> @test_mask_round_vfmadd512_ph_rrbz_rne(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { +; CHECK-LABEL: test_mask_round_vfmadd512_ph_rrbz_rne: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph {rn-sae}, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x18,0xa8,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 8) nounwind + ret <32 x half> %res +} + +define <32 x half> @test_mask_round_vfmadd512_ph_rrbz_rtn(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { +; CHECK-LABEL: 
test_mask_round_vfmadd512_ph_rrbz_rtn: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph {rd-sae}, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x38,0xa8,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 9) nounwind + ret <32 x half> %res +} + +define <32 x half> @test_mask_round_vfmadd512_ph_rrbz_rtp(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { +; CHECK-LABEL: test_mask_round_vfmadd512_ph_rrbz_rtp: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph {ru-sae}, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x58,0xa8,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 10) nounwind + ret <32 x half> %res +} + +define <32 x half> @test_mask_round_vfmadd512_ph_rrbz_rtz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { +; CHECK-LABEL: test_mask_round_vfmadd512_ph_rrbz_rtz: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph {rz-sae}, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x78,0xa8,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 11) nounwind + ret <32 x half> %res +} + +define <32 x half> @test_mask_round_vfmadd512_ph_rrbz_current(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { +; CHECK-LABEL: test_mask_round_vfmadd512_ph_rrbz_current: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x48,0xa8,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4) nounwind + ret <32 x half> %res +} + +define <32 x half>@test_int_x86_avx512_mask3_vfmsub_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ph_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xba,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_ph_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xba,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <32 x half> , %x2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %1) + %3 = bitcast i32 %x3 to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %x2 + ret <32 x half> %4 +} + +define <32 x half>@test_int_x86_avx512_mask_vfmadd_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){ +; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ph_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x98,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vfmadd_ph_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: 
[0x62,0xf6,0x6d,0x49,0x98,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 4) + %bc = bitcast i32 %x3 to <32 x i1> + %sel = select <32 x i1> %bc, <32 x half> %res, <32 x half> %x0 + ret <32 x half> %sel +} + +define <32 x half>@test_int_x86_avx512_mask3_vfmadd_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ph_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xb8,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ph_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xb8,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x half> @llvm.fma.v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x half> %1, <32 x half> %x2 + ret <32 x half> %3 +} + +define <32 x half> @test_int_x86_avx512_maskz_vfmadd_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ph_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xc9,0xa8,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ph_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xc9,0xa8,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x half> @llvm.fma.v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x half> %1, <32 x half> zeroinitializer + ret <32 x half> %3 +} + +define <32 x half>@test_int_x86_avx512_mask_vfnmsub_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){ +; X86-LABEL: test_int_x86_avx512_mask_vfnmsub_ph_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vfnmsub_ph_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <32 x half> , %x1 + %2 = fsub <32 x half> , %x2 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %x0, <32 x half> %1, <32 x half> %2) + %4 = bitcast i32 %x3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> %x0 + ret <32 x half> %5 +} + +define <32 x half>@test_int_x86_avx512_mask3_vfnmsub_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ph_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: 
vfnmsub231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xbe,0xd1] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_ph_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub231ph %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x49,0xbe,0xd1] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <32 x half> , %x0 + %2 = fsub <32 x half> , %x2 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %1, <32 x half> %x1, <32 x half> %2) + %4 = bitcast i32 %x3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> %x2 + ret <32 x half> %5 +} + +define <32 x half>@test_int_x86_avx512_mask_vfnmadd_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3){ +; X86-LABEL: test_int_x86_avx512_mask_vfnmadd_ph_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vfnmadd_ph_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd132ph %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x49,0x9c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <32 x half> , %x1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %x0, <32 x half> %1, <32 x half> %x2) + %3 = bitcast i32 %x3 to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %x0 + ret <32 x half> %4 +} + +define <32 x half> @test_x86_fma_vfnmadd_ph_512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmadd_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmadd213ph %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x48,0xac,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fsub <32 x half> , %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %1, <32 x half> %a1, <32 x half> %a2) + ret <32 x half> %2 +} + +define <32 x half> @test_x86_fma_vfnmsub_ph_512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmsub_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmsub213ph %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x75,0x48,0xae,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fsub <32 x half> , %a0 + %2 = fsub <32 x half> , %a0 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %1, <32 x half> %a1, <32 x half> %2) + ret <32 x half> %3 +} + +define <8 x half>@test_int_x86_avx512_mask3_vfmadd_sh(<8 x half> %x0, <8 x half> %x1, half *%ptr_b, i8 %x3, i32 %x4) { +; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_sh: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vfmadd231sh (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb9,0x08] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_sh: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vfmadd231sh (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb9,0x0f] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression 
encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %q = load half, half* %ptr_b + %vecinit.i = insertelement <8 x half> undef, half %q, i32 0 + %1 = extractelement <8 x half> %x0, i64 0 + %2 = extractelement <8 x half> %vecinit.i, i64 0 + %3 = extractelement <8 x half> %x1, i64 0 + %4 = call half @llvm.fma.f16(half %1, half %2, half %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, half %4, half %3 + %8 = insertelement <8 x half> %x1, half %7, i64 0 + ret <8 x half> %8 +} + +define <8 x half>@test_int_x86_avx512_maskz_vfmadd_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3, i32 %x4 ){ +; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_sh: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd213sh %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa9,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_sh: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd213sh %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa9,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = extractelement <8 x half> %x0, i64 0 + %2 = extractelement <8 x half> %x1, i64 0 + %3 = extractelement <8 x half> %x2, i64 0 + %4 = call half @llvm.fma.f16(half %1, half %2, half %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, half %4, half 0.000000e+00 + %8 = insertelement <8 x half> %x0, half %7, i64 0 + %9 = extractelement <8 x half> %x0, i64 0 + %10 = extractelement <8 x half> %x1, i64 0 + %11 = extractelement <8 x half> %x2, i64 0 + %12 = call half @llvm.x86.avx512fp16.vfmadd.f16(half %9, half %10, half %11, i32 3) + %13 = bitcast i8 %x3 to <8 x i1> + %14 = extractelement <8 x i1> %13, i64 0 + %15 = select i1 %14, half %12, half 0.000000e+00 + %16 = insertelement <8 x half> %x0, half %15, i64 0 + %res2 = fadd <8 x half> %8, %16 + ret <8 x half> %8 +} + +define void @fmadd_sh_mask_memfold(half* %a, half* %b, i8 %c) { +; X86-LABEL: fmadd_sh_mask_memfold: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x0c] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] +; X86-NEXT: vmovsh (%ecx), %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x01] +; X86-NEXT: vmovsh (%eax), %xmm1 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x08] +; X86-NEXT: vfmadd213sh %xmm0, %xmm0, %xmm1 # encoding: [0x62,0xf6,0x7d,0x08,0xa9,0xc8] +; X86-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xc1] +; X86-NEXT: vmovsh %xmm0, (%ecx) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: fmadd_sh_mask_memfold: +; X64: # %bb.0: +; X64-NEXT: vmovsh (%rdi), %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x07] +; X64-NEXT: vmovsh (%rsi), %xmm1 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x0e] +; X64-NEXT: vfmadd213sh %xmm0, %xmm0, %xmm1 # encoding: [0x62,0xf6,0x7d,0x08,0xa9,0xc8] +; X64-NEXT: kmovd %edx, %k1 # encoding: [0xc5,0xfb,0x92,0xca] +; X64-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xc1] +; X64-NEXT: vmovsh %xmm0, (%rdi) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %a.val = load half, half* %a + %av0 = insertelement <8 x half> undef, half %a.val, i32 0 + %av1 = insertelement <8 x 
half> %av0, half 0.000000e+00, i32 1 + %av2 = insertelement <8 x half> %av1, half 0.000000e+00, i32 2 + %av3 = insertelement <8 x half> %av2, half 0.000000e+00, i32 3 + %av4 = insertelement <8 x half> %av3, half 0.000000e+00, i32 4 + %av5 = insertelement <8 x half> %av4, half 0.000000e+00, i32 5 + %av6 = insertelement <8 x half> %av5, half 0.000000e+00, i32 6 + %av = insertelement <8 x half> %av6, half 0.000000e+00, i32 7 + + %b.val = load half, half* %b + %bv0 = insertelement <8 x half> undef, half %b.val, i32 0 + %bv1 = insertelement <8 x half> %bv0, half 0.000000e+00, i32 1 + %bv2 = insertelement <8 x half> %bv1, half 0.000000e+00, i32 2 + %bv3 = insertelement <8 x half> %bv2, half 0.000000e+00, i32 3 + %bv4 = insertelement <8 x half> %bv3, half 0.000000e+00, i32 4 + %bv5 = insertelement <8 x half> %bv4, half 0.000000e+00, i32 5 + %bv6 = insertelement <8 x half> %bv5, half 0.000000e+00, i32 6 + %bv = insertelement <8 x half> %bv6, half 0.000000e+00, i32 7 + %1 = extractelement <8 x half> %av, i64 0 + %2 = extractelement <8 x half> %bv, i64 0 + %3 = extractelement <8 x half> %av, i64 0 + %4 = call half @llvm.fma.f16(half %1, half %2, half %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, half %4, half %1 + %8 = insertelement <8 x half> %av, half %7, i64 0 + %sr = extractelement <8 x half> %8, i32 0 + store half %sr, half* %a + ret void +} + +declare half @llvm.fma.f16(half, half, half) +declare half @llvm.x86.avx512fp16.vfmadd.f16(half, half, half, i32) + +declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>) diff --git a/llvm/test/CodeGen/X86/avx512fp16vl-fma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16vl-fma-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16vl-fma-intrinsics.ll @@ -0,0 +1,530 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl -mattr=+avx512fp16 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -mattr=+avx512fp16 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64 + + +define <16 x half> @test_x86_vfnmadd_ph_z_256(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { +; CHECK-LABEL: test_x86_vfnmadd_ph_z_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmadd213ph %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x75,0x28,0xac,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fsub <16 x half> , %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %1, <16 x half> %a2) + ret <16 x half> %2 +} + +define <16 x half> @test_mask_vfnmadd_ph_256(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16 %mask) { +; X86-LABEL: test_mask_vfnmadd_ph_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_vfnmadd_ph_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <16 x half> , %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %1, <16 x half> %a2) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + 
+define <16 x half> @test_x86_vfnmsubph_z_256(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { +; CHECK-LABEL: test_x86_vfnmsubph_z_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmsub213ph %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x75,0x28,0xae,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fsub <16 x half> , %a1 + %2 = fsub <16 x half> , %a2 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %1, <16 x half> %2) + ret <16 x half> %3 +} + +define <16 x half> @test_mask_vfnmsub_ph_256(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16 %mask) { +; X86-LABEL: test_mask_vfnmsub_ph_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_vfnmsub_ph_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <16 x half> , %a1 + %2 = fsub <16 x half> , %a2 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %1, <16 x half> %2) + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> %a0 + ret <16 x half> %5 +} + +define <16 x half>@test_int_x86_avx512_mask3_vfmaddsub_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ph_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmaddsub231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xb6,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ph_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmaddsub231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xb6,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2) + %bc = bitcast i16 %x3 to <16 x i1> + %sel = select <16 x i1> %bc, <16 x half> %res, <16 x half> %x2 + ret <16 x half> %sel +} +declare <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half>, <16 x half>, <16 x half>) + +define <16 x half>@test_int_x86_avx512_maskz_vfmaddsub_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){ +; X86-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ph_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmaddsub213ph %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xa9,0xa6,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ph_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmaddsub213ph %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xa9,0xa6,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2) + %bc = bitcast i16 %x3 to <16 x i1> + %sel = select <16 x i1> %bc, <16 x half> %res, <16 x half> zeroinitializer + ret <16 
x half> %sel +} + +define <16 x half>@test_int_x86_avx512_mask3_vfmsubadd_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ph_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsubadd231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xb7,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ph_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsubadd231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xb7,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %neg = fneg <16 x half> %x2 + %res = call <16 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.256(<16 x half> %x0, <16 x half> %x1, <16 x half> %neg) + %bc = bitcast i16 %x3 to <16 x i1> + %sel = select <16 x i1> %bc, <16 x half> %res, <16 x half> %x2 + ret <16 x half> %sel +} + +define <16 x half>@test_int_x86_avx512_mask3_vfmsub_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ph_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xba,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_ph_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xba,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <16 x half> , %x2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %1) + %3 = bitcast i16 %x3 to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %x2 + ret <16 x half> %4 +} + +define <16 x half>@test_int_x86_avx512_mask3_vfmadd_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ph_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xb8,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ph_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xb8,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x half> @llvm.fma.v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x half> %1, <16 x half> %x2 + ret <16 x half> %3 +} + +define <16 x half> @test_int_x86_avx512_maskz_vfmadd_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ph_256: +; X86: # %bb.0: 
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xa9,0xa8,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ph_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0xa9,0xa8,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x half> @llvm.fma.v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x half> %1, <16 x half> zeroinitializer + ret <16 x half> %3 +} + +define <16 x half>@test_int_x86_avx512_mask_vfnmsub_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){ +; X86-LABEL: test_int_x86_avx512_mask_vfnmsub_ph_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vfnmsub_ph_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <16 x half> , %x1 + %2 = fsub <16 x half> , %x2 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %x0, <16 x half> %1, <16 x half> %2) + %4 = bitcast i16 %x3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> %x0 + ret <16 x half> %5 +} + +define <16 x half>@test_int_x86_avx512_mask3_vfnmsub_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ph_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xbe,0xd1] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_ph_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub231ph %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x29,0xbe,0xd1] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <16 x half> , %x0 + %2 = fsub <16 x half> , %x2 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %1, <16 x half> %x1, <16 x half> %2) + %4 = bitcast i16 %x3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> %x2 + ret <16 x half> %5 +} + +define <16 x half>@test_int_x86_avx512_mask_vfnmadd_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3){ +; X86-LABEL: test_int_x86_avx512_mask_vfnmadd_ph_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vfnmadd_ph_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd132ph %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x29,0x9c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <16 x half> , %x1 + %2 = call <16 x half> 
@llvm.fma.v16f16(<16 x half> %x0, <16 x half> %1, <16 x half> %x2) + %3 = bitcast i16 %x3 to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %x0 + ret <16 x half> %4 +} + +define <16 x half> @test_x86_fma_vfnmadd_ph_256(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmadd_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmadd213ph %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x75,0x28,0xac,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fsub <16 x half> , %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %1, <16 x half> %a1, <16 x half> %a2) + ret <16 x half> %2 +} + +define <16 x half> @test_x86_fma_vfnmsub_ph_256(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmsub_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmsub213ph %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x75,0x28,0xae,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fsub <16 x half> , %a0 + %2 = fsub <16 x half> , %a0 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %1, <16 x half> %a1, <16 x half> %2) + ret <16 x half> %3 +} + +declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>) + +define <8 x half> @test_x86_vfnmadd_ph_z_128(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { +; CHECK-LABEL: test_x86_vfnmadd_ph_z_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmadd213ph %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x75,0x08,0xac,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fsub <8 x half> , %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %1, <8 x half> %a2) + ret <8 x half> %2 +} + +define <8 x half> @test_mask_vfnmadd_ph_128(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask) { +; X86-LABEL: test_mask_vfnmadd_ph_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_vfnmadd_ph_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <8 x half> , %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %1, <8 x half> %a2) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @test_x86_vfnmsubph_z_128(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { +; CHECK-LABEL: test_x86_vfnmsubph_z_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmsub213ph %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x75,0x08,0xae,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fsub <8 x half> , %a1 + %2 = fsub <8 x half> , %a2 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %1, <8 x half> %2) + ret <8 x half> %3 +} + +define <8 x half> @test_mask_vfnmsub_ph_128(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask) { +; X86-LABEL: test_mask_vfnmsub_ph_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_vfnmsub_ph_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: 
[0x62,0xf6,0x6d,0x09,0x9e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <8 x half> , %a1 + %2 = fsub <8 x half> , %a2 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %1, <8 x half> %2) + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> %a0 + ret <8 x half> %5 +} + +define <8 x half>@test_int_x86_avx512_mask3_vfmaddsub_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ph_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmaddsub231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb6,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ph_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmaddsub231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb6,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2) + %bc = bitcast i8 %x3 to <8 x i1> + %sel = select <8 x i1> %bc, <8 x half> %res, <8 x half> %x2 + ret <8 x half> %sel +} +declare <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half>, <8 x half>, <8 x half>) + +define <8 x half>@test_int_x86_avx512_maskz_vfmaddsub_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){ +; X86-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ph_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmaddsub213ph %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa6,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ph_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmaddsub213ph %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa6,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2) + %bc = bitcast i8 %x3 to <8 x i1> + %sel = select <8 x i1> %bc, <8 x half> %res, <8 x half> zeroinitializer + ret <8 x half> %sel +} + +define <8 x half>@test_int_x86_avx512_mask3_vfmsubadd_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ph_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsubadd231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb7,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ph_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsubadd231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb7,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %neg = fneg <8 x half> %x2 + %res = call <8 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.128(<8 x half> %x0, <8 x half> %x1, <8 x half> %neg) + %bc = bitcast i8 %x3 to <8 x i1> + %sel = select <8 x i1> 
%bc, <8 x half> %res, <8 x half> %x2 + ret <8 x half> %sel +} + +define <8 x half>@test_int_x86_avx512_mask3_vfmsub_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ph_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmsub231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xba,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_ph_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmsub231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xba,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <8 x half> , %x2 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %1) + %3 = bitcast i8 %x3 to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %x2 + ret <8 x half> %4 +} + +define <8 x half>@test_int_x86_avx512_mask3_vfmadd_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ph_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb8,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ph_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb8,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <8 x half> @llvm.fma.v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x half> %1, <8 x half> %x2 + ret <8 x half> %3 +} + +define <8 x half> @test_int_x86_avx512_maskz_vfmadd_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ph_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa8,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ph_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa8,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <8 x half> @llvm.fma.v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x half> %1, <8 x half> zeroinitializer + ret <8 x half> %3 +} + +define <8 x half>@test_int_x86_avx512_mask_vfnmsub_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){ +; X86-LABEL: test_int_x86_avx512_mask_vfnmsub_ph_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: 
test_int_x86_avx512_mask_vfnmsub_ph_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <8 x half> , %x1 + %2 = fsub <8 x half> , %x2 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %x0, <8 x half> %1, <8 x half> %2) + %4 = bitcast i8 %x3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> %x0 + ret <8 x half> %5 +} + +define <8 x half>@test_int_x86_avx512_mask3_vfnmsub_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){ +; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ph_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmsub231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xbe,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_ph_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmsub231ph %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xbe,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <8 x half> , %x0 + %2 = fsub <8 x half> , %x2 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %1, <8 x half> %x1, <8 x half> %2) + %4 = bitcast i8 %x3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> %x2 + ret <8 x half> %5 +} + +define <8 x half>@test_int_x86_avx512_mask_vfnmadd_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3){ +; X86-LABEL: test_int_x86_avx512_mask_vfnmadd_ph_128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vfnmadd132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vfnmadd_ph_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vfnmadd132ph %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf6,0x6d,0x09,0x9c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = fsub <8 x half> , %x1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %x0, <8 x half> %1, <8 x half> %x2) + %3 = bitcast i8 %x3 to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %x0 + ret <8 x half> %4 +} + +define <8 x half> @test_x86_fma_vfnmadd_ph_128(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmadd_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmadd213ph %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x75,0x08,0xac,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fsub <8 x half> , %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %1, <8 x half> %a1, <8 x half> %a2) + ret <8 x half> %2 +} + +define <8 x half> @test_x86_fma_vfnmsub_ph_128(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmsub_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmsub213ph %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf6,0x75,0x08,0xae,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = fsub <8 x half> , %a0 + %2 = fsub <8 x half> , %a0 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %1, <8 x half> %a1, <8 x half> %2) + ret <8 x half> %3 +} + +declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) diff --git 
a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll @@ -11,6 +11,7 @@ declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, metadata) declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata) define half @fadd_f16(half %a, half %b) nounwind strictfp { ; X86-LABEL: fadd_f16: @@ -197,4 +198,22 @@ ret void } +define half @fma_f16(half %a, half %b, half %c) nounwind strictfp { +; X86-LABEL: fma_f16: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vfmadd213sh {{[0-9]+}}(%esp), %xmm1, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: fma_f16: +; X64: # %bb.0: +; X64-NEXT: vfmadd213sh %xmm2, %xmm1, %xmm0 +; X64-NEXT: retq + %res = call half @llvm.experimental.constrained.fma.f16(half %a, half %b, half %c, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %res +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll @@ -0,0 +1,2526 @@ +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; Stack reload folding tests. +; +; By including a nop call with sideeffects we can force a partial register spill of the +; relevant registers and check that the reload is correctly folded into the instruction. 
+ +define <32 x half> @stack_fold_fmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd123ph: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) + ret <32 x half> %2 +} +declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>) + +define <32 x half> @stack_fold_fmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd213ph: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_fmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd231ph: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_fmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd321ph: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_fmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd132ph: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1) + ret <32 x half> %2 +} + +define <32 x half> 
@stack_fold_fmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd312ph: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_fmadd123ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmadd123ph_mask: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmadd213ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmadd213ph_mask: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmadd231ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmadd231ph_mask: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmadd321ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmadd321ph_mask: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded 
Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmadd132ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmadd132ph_mask: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmadd312ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmadd312ph_mask: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmadd123ph_maskz: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmadd213ph_maskz: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmadd231ph_maskz: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmadd321ph_maskz: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmadd132ph_maskz: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmadd312ph_maskz: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub123ph: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a2 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %2) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub213ph: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a2 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %2) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub231ph: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a0 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %2) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub321ph: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a0 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %2) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub132ph: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), 
{{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a1 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %2) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub312ph: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a1 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %2) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fmsub123ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmsub123ph_mask: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmsub213ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmsub213ph_mask: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmsub231ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmsub231ph_mask: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmsub321ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmsub321ph_mask: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmsub132ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmsub132ph_mask: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmsub312ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmsub312ph_mask: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmsub123ph_maskz: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded 
Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmsub213ph_maskz: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmsub231ph_maskz: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmsub321ph_maskz: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmsub132ph_maskz: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} 
{{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmsub312ph_maskz: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fnmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd123ph: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a0 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a2) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fnmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd213ph: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a1 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a2) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fnmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd231ph: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> 
%a1 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a0) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fnmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd321ph: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a2 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a0) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fnmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd132ph: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a0 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a1) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fnmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd312ph: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a2 + %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a1) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fnmadd123ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd123ph_mask: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmadd213ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd213ph_mask: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmadd231ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd231ph_mask: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmadd321ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd321ph_mask: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmadd132ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd132ph_mask: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmadd312ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd312ph_mask: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte 
Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd123ph_maskz: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fnmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd213ph_maskz: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fnmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd231ph_maskz: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fnmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd321ph_maskz: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, 
{{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fnmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd132ph_maskz: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fnmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd312ph_maskz: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fnmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub123ph: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a0 + %3 = fneg <32 x half> %a2 + %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %3) + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub213ph: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a1 + %3 = fneg <32 x half> %a2 + %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %3) + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub231ph: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a1 + %3 = fneg <32 x half> %a0 + %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %3) + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub321ph: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a2 + %3 = fneg <32 x half> %a0 + %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %3) + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub132ph: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a0 + %3 = fneg <32 x half> %a1 + %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %3) + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub312ph: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a2 + %3 = fneg <32 x half> %a1 + %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %3) + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmsub123ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: 
stack_fold_fnmsub123ph_mask: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a2 + %neg1 = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmsub213ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub213ph_mask: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a2 + %neg1 = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmsub231ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub231ph_mask: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a0 + %neg1 = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmsub321ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub321ph_mask: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a0 + %neg1 = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> 
%3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmsub132ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub132ph_mask: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a1 + %neg1 = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmsub312ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub312ph_mask: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a1 + %neg1 = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fnmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub123ph_maskz: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a2 + %neg1 = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fnmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub213ph_maskz: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a2 + %neg1 = fneg <32 x half> %a1 + %2 = call <32 
x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fnmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub231ph_maskz: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a0 + %neg1 = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fnmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub321ph_maskz: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a0 + %neg1 = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fnmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub132ph_maskz: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a1 + %neg1 = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fnmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub312ph_maskz: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a1 + %neg1 = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define half @stack_fold_fmadd123sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fmadd123sh: + ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2) + ret half %2 +} +declare half @llvm.fma.f16(half, half, half) + +define half @stack_fold_fmadd213sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fmadd213sh: + ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2) + ret half %2 +} + +define half @stack_fold_fmadd231sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fmadd231sh: + ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0) + ret half %2 +} + +define half @stack_fold_fmadd321sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fmadd321sh: + ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0) + ret half %2 +} + +define half @stack_fold_fmadd132sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fmadd132sh: + ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1) + ret half %2 +} + +define half @stack_fold_fmadd312sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fmadd312sh: + ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1) + ret half %2 +} + +define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fmsub123sh: + ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a2 + %3 = call half @llvm.fma.f16(half %a0, half %a1, half %2) + ret half %3 +} + +define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fmsub213sh: + ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a2 + %3 = call half @llvm.fma.f16(half %a1, half %a0, half %2) + ret half %3 +} + +define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fmsub231sh: + ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a0 + %3 = call half @llvm.fma.f16(half %a1, half %a2, half %2) + ret half %3 +} + +define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fmsub321sh: + ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a0 + %3 = call half @llvm.fma.f16(half %a2, half %a1, half %2) + ret half %3 +} + +define half 
@stack_fold_fmsub132sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fmsub132sh: + ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a1 + %3 = call half @llvm.fma.f16(half %a0, half %a2, half %2) + ret half %3 +} + +define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fmsub312sh: + ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a1 + %3 = call half @llvm.fma.f16(half %a2, half %a0, half %2) + ret half %3 +} + +define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fnmadd123sh: + ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a0 + %3 = call half @llvm.fma.f16(half %2, half %a1, half %a2) + ret half %3 +} + +define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fnmadd213sh: + ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a1 + %3 = call half @llvm.fma.f16(half %2, half %a0, half %a2) + ret half %3 +} + +define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fnmadd231sh: + ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a1 + %3 = call half @llvm.fma.f16(half %2, half %a2, half %a0) + ret half %3 +} + +define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fnmadd321sh: + ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a2 + %3 = call half @llvm.fma.f16(half %2, half %a1, half %a0) + ret half %3 +} + +define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fnmadd132sh: + ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a0 + %3 = call half @llvm.fma.f16(half %2, half %a2, half %a1) + ret half %3 +} + +define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fnmadd312sh: + ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a2 + %3 = call half @llvm.fma.f16(half %2, half %a0, half %a1) + ret half %3 +} + +define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fnmsub123sh: + ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a0 + %3 = fneg half %a2 + %4 = call half @llvm.fma.f16(half %2, half %a1, half %3) + ret half %4 +} + +define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fnmsub213sh: + ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a1 + %3 = fneg half %a2 + %4 = call half @llvm.fma.f16(half %2, half %a0, half %3) + ret half %4 +} + +define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fnmsub231sh: + ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a1 + %3 = fneg 
half %a0 + %4 = call half @llvm.fma.f16(half %2, half %a2, half %3) + ret half %4 +} + +define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fnmsub321sh: + ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a2 + %3 = fneg half %a0 + %4 = call half @llvm.fma.f16(half %2, half %a1, half %3) + ret half %4 +} + +define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fnmsub132sh: + ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a0 + %3 = fneg half %a1 + %4 = call half @llvm.fma.f16(half %2, half %a2, half %3) + ret half %4 +} + +define half @stack_fold_fnmsub312sh(half %a0, half %a1, half %a2) { + ;CHECK-LABEL: stack_fold_fnmsub312sh: + ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg half %a2 + %3 = fneg half %a1 + %4 = call half @llvm.fma.f16(half %2, half %a0, half %3) + ret half %4 +} + +define <8 x half> @stack_fold_fmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { + ;CHECK-LABEL: stack_fold_fmadd123sh_int: + ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2) + %res = insertelement <8 x half> %a0v, half %2, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { + ;CHECK-LABEL: stack_fold_fmadd213sh_int: + ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + 
%a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2) + %res = insertelement <8 x half> %a0v, half %2, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { + ;CHECK-LABEL: stack_fold_fmadd231sh_int: + ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0) + %res = insertelement <8 x half> %a0v, half %2, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { + ;CHECK-LABEL: stack_fold_fmadd321sh_int: + ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0) + %res = insertelement <8 x half> %a0v, half %2, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { + ;CHECK-LABEL: stack_fold_fmadd132sh_int: + ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1) + %res = insertelement <8 x half> %a0v, half %2, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { + ;CHECK-LABEL: stack_fold_fmadd312sh_int: + ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a2, 
half %a0, half %a1) + %res = insertelement <8 x half> %a0v, half %2, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { + ;CHECK-LABEL: stack_fold_fmsub123sh_int: + ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a2 + %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg) + %res = insertelement <8 x half> %a0v, half %2, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { + ;CHECK-LABEL: stack_fold_fmsub213sh_int: + ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a2 + %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg) + %res = insertelement <8 x half> %a0v, half %2, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { + ;CHECK-LABEL: stack_fold_fmsub231sh_int: + ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a0 + %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg) + %res = insertelement <8 x half> %a0v, half %2, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { + ;CHECK-LABEL: stack_fold_fmsub321sh_int: + ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a0 + %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg) + %res = 
insertelement <8 x half> %a0v, half %2, i64 0
+  ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+  ;CHECK-LABEL: stack_fold_fmsub132sh_int:
+  ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %a0 = extractelement <8 x half> %a0v, i64 0
+  %a1 = extractelement <8 x half> %a1v, i64 0
+  %a2 = extractelement <8 x half> %a2v, i64 0
+  %neg = fneg half %a1
+  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
+  %res = insertelement <8 x half> %a0v, half %2, i64 0
+  ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+  ;CHECK-LABEL: stack_fold_fmsub312sh_int:
+  ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %a0 = extractelement <8 x half> %a0v, i64 0
+  %a1 = extractelement <8 x half> %a1v, i64 0
+  %a2 = extractelement <8 x half> %a2v, i64 0
+  %neg = fneg half %a1
+  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
+  %res = insertelement <8 x half> %a0v, half %2, i64 0
+  ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+  ;CHECK-LABEL: stack_fold_fnmadd123sh_int:
+  ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %a0 = extractelement <8 x half> %a0v, i64 0
+  %a1 = extractelement <8 x half> %a1v, i64 0
+  %a2 = extractelement <8 x half> %a2v, i64 0
+  %neg1 = fneg half %a0
+  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
+  %res = insertelement <8 x half> %a0v, half %2, i64 0
+  ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+  ;CHECK-LABEL: stack_fold_fnmadd213sh_int:
+  ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %a0 = extractelement <8 x half> %a0v, i64 0
+  %a1 = extractelement <8 x half> %a1v, i64 0
+  %a2 = extractelement <8 x half> %a2v, i64 0
+  %neg1 = fneg half %a1
+  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
+  %res = insertelement <8 x half> %a0v, half %2, i64 0
+  ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+  ;CHECK-LABEL: stack_fold_fnmadd231sh_int:
+  ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %a0 = extractelement <8 x half> %a0v, i64 0
+  %a1 = extractelement <8 x half> %a1v, i64 0
+  %a2 = extractelement <8 x half> %a2v, i64 0
+  %neg1 = fneg half %a1
+  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
+  %res = insertelement <8 x half> %a0v, half %2, i64 0
+  ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+  ;CHECK-LABEL: stack_fold_fnmadd321sh_int:
+  ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %a0 = extractelement <8 x half> %a0v, i64 0
+  %a1 = extractelement <8 x half> %a1v, i64 0
+  %a2 = extractelement <8 x half> %a2v, i64 0
+  %neg1 = fneg half %a2
+  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
+  %res = insertelement <8 x half> %a0v, half %2, i64 0
+  ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+  ;CHECK-LABEL: stack_fold_fnmadd132sh_int:
+  ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %a0 = extractelement <8 x half> %a0v, i64 0
+  %a1 = extractelement <8 x half> %a1v, i64 0
+  %a2 = extractelement <8 x half> %a2v, i64 0
+  %neg1 = fneg half %a0
+  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
+  %res = insertelement <8 x half> %a0v, half %2, i64 0
+  ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+  ;CHECK-LABEL: stack_fold_fnmadd312sh_int:
+  ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %a0 = extractelement <8 x half> %a0v, i64 0
+  %a1 = extractelement <8 x half> %a1v, i64 0
+  %a2 = extractelement <8 x half> %a2v, i64 0
+  %neg1 = fneg half %a2
+  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
+  %res = insertelement <8 x half> %a0v, half %2, i64 0
+  ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+  ;CHECK-LABEL: stack_fold_fnmsub123sh_int:
+  ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %a0 = extractelement <8 x half> %a0v, i64 0
+  %a1 = extractelement <8 x half> %a1v, i64 0
+  %a2 = extractelement <8 x half> %a2v, i64 0
+  %neg = fneg half %a2
+  %neg1 = fneg half %a0
+  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
+  %res = insertelement <8 x half> %a0v, half %2, i64 0
+  ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+  ;CHECK-LABEL: stack_fold_fnmsub213sh_int:
+  ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %a0 = extractelement <8 x half> %a0v, i64 0
+  %a1 = extractelement <8 x half> %a1v, i64 0
+  %a2 = extractelement <8 x half> %a2v, i64 0
+  %neg = fneg half %a2
+  %neg1 = fneg half %a1
+  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
+  %res = insertelement <8 x half> %a0v, half %2, i64 0
+  ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+  ;CHECK-LABEL: stack_fold_fnmsub231sh_int:
+  ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %a0 = extractelement <8 x half> %a0v, i64 0
+  %a1 = extractelement <8 x half> %a1v, i64 0
+  %a2 = extractelement <8 x half> %a2v, i64 0
+  %neg = fneg half %a0
+  %neg1 = fneg half %a1
+  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
+  %res = insertelement <8 x half> %a0v, half %2, i64 0
+  ret <8 x half> %res
+}
+
+define <8 x half> @stack_fold_fnmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
+  ;CHECK-LABEL: stack_fold_fnmsub321sh_int:
+  ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %a0 = extractelement <8 x half> %a0v, i64 0
+  %a1 = extractelement <8 x half> %a1v, i64 0
+  %a2 = extractelement <8 x half> %a2v, i64 0
+  %neg = fneg half %a0
+  %neg1 = fneg half %a2
+  %2 = call half
@llvm.fma.f16(half %neg1, half %a1, half %neg) + %res = insertelement <8 x half> %a0v, half %2, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { + ;CHECK-LABEL: stack_fold_fnmsub132sh_int: + ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a1 + %neg1 = fneg half %a0 + %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg) + %res = insertelement <8 x half> %a0v, half %2, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { + ;CHECK-LABEL: stack_fold_fnmsub312sh_int: + ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a1 + %neg1 = fneg half %a2 + %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg) + %res = insertelement <8 x half> %a0v, half %2, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd123sh_intk: + ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd213sh_intk: + ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd231sh_intk: + ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd321sh_intk: + ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd132sh_intk: + ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1) + %3 = load i8, i8* 
%mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd312sh_intk: + ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub123sh_intk: + ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a2 + %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub213sh_intk: + ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a2 + %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub231sh_intk: + ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} 
{{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a0 + %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub321sh_intk: + ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a0 + %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub132sh_intk: + ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a1 + %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub312sh_intk: + ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 
= extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a1 + %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd123sh_intk: + ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg1 = fneg half %a0 + %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd213sh_intk: + ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg1 = fneg half %a1 + %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd231sh_intk: + ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg1 = fneg half %a1 + %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define 
<8 x half> @stack_fold_fnmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd321sh_intk: + ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg1 = fneg half %a2 + %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd132sh_intk: + ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg1 = fneg half %a0 + %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd312sh_intk: + ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg1 = fneg half %a2 + %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub123sh_intk: + ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a2 + %neg1 = fneg half %a0 + %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub213sh_intk: + ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a2 + %neg1 = fneg half %a1 + %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub231sh_intk: + ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a0 + %neg1 = fneg half %a1 + %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub321sh_intk: + ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = 
extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a0 + %neg1 = fneg half %a2 + %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub132sh_intk: + ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a1 + %neg1 = fneg half %a0 + %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub312sh_intk: + ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a1 + %neg1 = fneg half %a2 + %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half %a0 + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd123sh_intkz: + ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x 
half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd213sh_intkz: + ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd231sh_intkz: + ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd321sh_intkz: + ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd132sh_intkz: + ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd312sh_intkz: + ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub123sh_intkz: + ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a2 + %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub213sh_intkz: + ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + 
%neg = fneg half %a2 + %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub231sh_intkz: + ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a0 + %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub321sh_intkz: + ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a0 + %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub132sh_intkz: + ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a1 + %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> 
@stack_fold_fmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub312sh_intkz: + ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a1 + %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd123sh_intkz: + ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg1 = fneg half %a0 + %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd213sh_intkz: + ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg1 = fneg half %a1 + %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd231sh_intkz: + ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg1 = fneg half %a1 + %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd321sh_intkz: + ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg1 = fneg half %a2 + %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd132sh_intkz: + ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg1 = fneg half %a0 + %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd312sh_intkz: + ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x 
half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg1 = fneg half %a2 + %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub123sh_intkz: + ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a2 + %neg1 = fneg half %a0 + %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub213sh_intkz: + ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a2 + %neg1 = fneg half %a1 + %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub231sh_intkz: + ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a0 + %neg1 = fneg half %a1 + %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, 
half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub321sh_intkz: + ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a0 + %neg1 = fneg half %a2 + %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub132sh_intkz: + ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a1 + %neg1 = fneg half %a0 + %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <8 x half> @stack_fold_fnmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub312sh_intkz: + ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = extractelement <8 x half> %a0v, i64 0 + %a1 = extractelement <8 x half> %a1v, i64 0 + %a2 = extractelement <8 x half> %a2v, i64 0 + %neg = fneg half %a1 + %neg1 = fneg half %a2 + %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, half %2, half zeroinitializer + %res = insertelement <8 x half> %a0v, half %6, i64 0 + ret <8 x half> %res +} + +define <32 x half> @stack_fold_fmaddsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmaddsub123ph: + ;CHECK: 
vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4) + ret <32 x half> %2 +} +declare <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half>, <32 x half>, <32 x half>, i32) + +define <32 x half> @stack_fold_fmaddsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmaddsub213ph: + ;CHECK: vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_fmaddsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmaddsub231ph: + ;CHECK: vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_fmaddsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmaddsub321ph: + ;CHECK: vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_fmaddsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmaddsub132ph: + ;CHECK: vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4) + ret <32 x 
half> %2 +} + +define <32 x half> @stack_fold_fmaddsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmaddsub312ph: + ;CHECK: vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_fmaddsub123ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmaddsub123ph_mask: + ;CHECK: vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmaddsub213ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmaddsub213ph_mask: + ;CHECK: vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmaddsub231ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmaddsub231ph_mask: + ;CHECK: vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmaddsub321ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> 
%a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmaddsub321ph_mask: + ;CHECK: vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmaddsub132ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmaddsub132ph_mask: + ;CHECK: vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmaddsub312ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmaddsub312ph_mask: + ;CHECK: vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmaddsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmaddsub123ph_maskz: + ;CHECK: vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> 
@stack_fold_fmaddsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmaddsub213ph_maskz: + ;CHECK: vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmaddsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmaddsub231ph_maskz: + ;CHECK: vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmaddsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmaddsub321ph_maskz: + ;CHECK: vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmaddsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmaddsub132ph_maskz: + ;CHECK: vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 
+ ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmaddsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmaddsub312ph_maskz: + ;CHECK: vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmsubadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsubadd123ph: + ;CHECK: vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a2 + %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %2, i32 4) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fmsubadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsubadd213ph: + ;CHECK: vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a2 + %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %2, i32 4) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fmsubadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsubadd231ph: + ;CHECK: vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a0 + %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %2, i32 4) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fmsubadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsubadd321ph: + ;CHECK: vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a0 + %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %2, i32 4) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fmsubadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsubadd132ph: + ;CHECK: vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a1 + %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %2, i32 4) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fmsubadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsubadd312ph: + ;CHECK: vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <32 x half> %a1 + %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %2, i32 4) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_fmsubadd123ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmsubadd123ph_mask: + ;CHECK: vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmsubadd213ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmsubadd213ph_mask: + ;CHECK: vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = 
fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmsubadd231ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmsubadd231ph_mask: + ;CHECK: vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmsubadd321ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmsubadd321ph_mask: + ;CHECK: vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmsubadd132ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmsubadd132ph_mask: + ;CHECK: vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmsubadd312ph_mask(<32 x half>* %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { + ;CHECK-LABEL: stack_fold_fmsubadd312ph_mask: + ;CHECK: vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <32 x half>, <32 x half>* %p + %neg = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_fmsubadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmsubadd123ph_maskz: + ;CHECK: vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmsubadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmsubadd213ph_maskz: + ;CHECK: vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a2 + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmsubadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmsubadd231ph_maskz: + ;CHECK: vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmsubadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmsubadd321ph_maskz: + 
;CHECK: vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a0 + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmsubadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmsubadd132ph_maskz: + ;CHECK: vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_fmsubadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32* %mask) { + ;CHECK-LABEL: stack_fold_fmsubadd312ph_maskz: + ;CHECK: vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <32 x half> %a1 + %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4) + %3 = load i32, i32* %mask + %4 = bitcast i32 %3 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %5 +} diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl-fma.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl-fma.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl-fma.ll @@ -0,0 +1,1595 @@ +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; Stack reload folding tests. +; +; By including a nop call with sideeffects we can force a partial register spill of the +; relevant registers and check that the reload is correctly folded into the instruction. 
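For readers unfamiliar with the stack-folding tests, the sketch below is an illustrative reduction of the pattern every function in this file follows; it is not part of the patch, and the function name is invented. The three vector arguments arrive in %xmm0-%xmm2. The inline asm clobbers %xmm3-%xmm31 and also needs an XMM register for its "=x" result, so four XMM values compete for three unclobbered registers and the allocator has to spill one argument to the stack. The CHECK line then verifies that the reload comes back as a folded 16-byte memory operand of the FMA instruction rather than through a separate register reload.

    ; Minimal sketch of the spill-and-fold pattern (illustrative only, not part of the patch).
    declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)

    define <8 x half> @stack_fold_sketch(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
      ; Clobber every XMM register except the ones holding the three arguments,
      ; forcing one of them to be spilled around the call.
      %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
      ; With the spilled operand folded, this is expected to assemble to a form like
      ;   vfmadd213ph N(%rsp), %xmm1, %xmm0  # 16-byte Folded Reload
      ; where N is the spill-slot offset.
      %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2)
      ret <8 x half> %2
    }

The _mask variants below follow the same scheme but take the pass-through operand from memory and clobber %xmm2 as well, since only two vector arguments arrive in registers there.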
+ +define <8 x half> @stack_fold_fmadd123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd123ph: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) + ret <8 x half> %2 +} +declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) + +define <8 x half> @stack_fold_fmadd213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd213ph: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2) + ret <8 x half> %2 +} + +define <8 x half> @stack_fold_fmadd231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd231ph: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0) + ret <8 x half> %2 +} + +define <8 x half> @stack_fold_fmadd321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd321ph: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0) + ret <8 x half> %2 +} + +define <8 x half> @stack_fold_fmadd132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd132ph: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1) + ret <8 x half> %2 +} + +define <8 x half> @stack_fold_fmadd312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + 
;CHECK-LABEL: stack_fold_fmadd312ph: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1) + ret <8 x half> %2 +} + +define <8 x half> @stack_fold_fmadd123ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmadd123ph_mask: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fmadd213ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmadd213ph_mask: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fmadd231ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmadd231ph_mask: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fmadd321ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmadd321ph_mask: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fmadd132ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmadd132ph_mask: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fmadd312ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmadd312ph_mask: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fmadd123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd123ph_maskz: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fmadd213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd213ph_maskz: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fmadd231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd231ph_maskz: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fmadd321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd321ph_maskz: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fmadd132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd132ph_maskz: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fmadd312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmadd312ph_maskz: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fmsub123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub123ph: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a2 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %2) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_fmsub213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub213ph: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a2 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %2) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_fmsub231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub231ph: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a0 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %2) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_fmsub321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub321ph: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a0 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %2) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_fmsub132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub132ph: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte 
Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a1 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %2) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_fmsub312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub312ph: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a1 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %2) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_fmsub123ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmsub123ph_mask: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a2 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %neg) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fmsub213ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmsub213ph_mask: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a2 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %neg) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fmsub231ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmsub231ph_mask: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load 
<8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %neg) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fmsub321ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmsub321ph_mask: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %neg) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fmsub132ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmsub132ph_mask: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %neg) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fmsub312ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fmsub312ph_mask: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %neg) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fmsub123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub123ph_maskz: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a2 + %2 = call <8 
x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fmsub213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub213ph_maskz: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a2 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fmsub231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub231ph_maskz: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fmsub321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub321ph_maskz: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fmsub132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub132ph_maskz: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> 
%a2, <8 x half> %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fmsub312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fmsub312ph_maskz: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fnmadd123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd123ph: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a0 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %a2) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_fnmadd213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd213ph: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a1 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %a2) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_fnmadd231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd231ph: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a1 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %a0) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_fnmadd321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd321ph: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a2 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %a0) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_fnmadd132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd132ph: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a0 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %a1) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_fnmadd312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd312ph: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a2 + %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %a1) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_fnmadd123ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd123ph_mask: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a2) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmadd213ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd213ph_mask: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a2) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x 
half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmadd231ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd231ph_mask: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a0) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmadd321ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd321ph_mask: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a2 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a0) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmadd132ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd132ph_mask: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a1) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmadd312ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd312ph_mask: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a2 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a1) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 
x half> %4 +} + +define <8 x half> @stack_fold_fnmadd123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd123ph_maskz: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a2) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fnmadd213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd213ph_maskz: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a2) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fnmadd231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd231ph_maskz: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a0) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fnmadd321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd321ph_maskz: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a2 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a0) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> 
@stack_fold_fnmadd132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd132ph_maskz: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a1) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fnmadd312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd312ph_maskz: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a2 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a1) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fnmsub123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub123ph: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a0 + %3 = fneg <8 x half> %a2 + %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %3) + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmsub213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub213ph: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a1 + %3 = fneg <8 x half> %a2 + %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %3) + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmsub231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub231ph: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a1 + %3 = fneg <8 x half> %a0 + %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %3) + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmsub321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub321ph: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a2 + %3 = fneg <8 x half> %a0 + %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %3) + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmsub132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub132ph: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a0 + %3 = fneg <8 x half> %a1 + %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %3) + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmsub312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub312ph: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <8 x half> %a2 + %3 = fneg <8 x half> %a1 + %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %3) + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmsub123ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub123ph_mask: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a2 + %neg1 = fneg <8 x half> %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 
x half> @stack_fold_fnmsub213ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub213ph_mask: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a2 + %neg1 = fneg <8 x half> %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmsub231ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub231ph_mask: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a0 + %neg1 = fneg <8 x half> %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmsub321ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub321ph_mask: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a0 + %neg1 = fneg <8 x half> %a2 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmsub132ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub132ph_mask: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a1 + %neg1 = fneg <8 x half> %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg) + %3 = bitcast i8 
%mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmsub312ph_mask(<8 x half>* %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub312ph_mask: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x half>, <8 x half>* %p + %neg = fneg <8 x half> %a1 + %neg1 = fneg <8 x half> %a2 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg) + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 + ret <8 x half> %4 +} + +define <8 x half> @stack_fold_fnmsub123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub123ph_maskz: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a2 + %neg1 = fneg <8 x half> %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fnmsub213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub213ph_maskz: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a2 + %neg1 = fneg <8 x half> %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fnmsub231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub231ph_maskz: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a0 + %neg1 = fneg <8 x half> %a1 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> 
%neg1, <8 x half> %a2, <8 x half> %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fnmsub321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub321ph_maskz: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a0 + %neg1 = fneg <8 x half> %a2 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fnmsub132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub132ph_maskz: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a1 + %neg1 = fneg <8 x half> %a0 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_fnmsub312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub312ph_maskz: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <8 x half> %a1 + %neg1 = fneg <8 x half> %a2 + %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg) + %3 = load i8, i8* %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <16 x half> @stack_fold_fmadd123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd123ph_ymm: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> 
%a0, <16 x half> %a1, <16 x half> %a2) + ret <16 x half> %2 +} +declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>) + +define <16 x half> @stack_fold_fmadd213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd213ph_ymm: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2) + ret <16 x half> %2 +} + +define <16 x half> @stack_fold_fmadd231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd231ph_ymm: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0) + ret <16 x half> %2 +} + +define <16 x half> @stack_fold_fmadd321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd321ph_ymm: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0) + ret <16 x half> %2 +} + +define <16 x half> @stack_fold_fmadd132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd132ph_ymm: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1) + ret <16 x half> %2 +} + +define <16 x half> @stack_fold_fmadd312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmadd312ph_ymm: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, 
<16 x half> %a0, <16 x half> %a1) + ret <16 x half> %2 +} + +define <16 x half> @stack_fold_fmadd123ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmadd123ph_mask_ymm: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fmadd213ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmadd213ph_mask_ymm: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fmadd231ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmadd231ph_mask_ymm: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fmadd321ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmadd321ph_mask_ymm: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x 
half> %4 +} + +define <16 x half> @stack_fold_fmadd132ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmadd132ph_mask_ymm: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fmadd312ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmadd312ph_mask_ymm: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fmadd123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmadd123ph_maskz_ymm: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fmadd213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmadd213ph_maskz_ymm: + ;CHECK: vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> 
@stack_fold_fmadd231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmadd231ph_maskz_ymm: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fmadd321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmadd321ph_maskz_ymm: + ;CHECK: vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fmadd132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmadd132ph_maskz_ymm: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fmadd312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmadd312ph_maskz_ymm: + ;CHECK: vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fmsub123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x 
half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub123ph_ymm: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a2 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %2) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_fmsub213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub213ph_ymm: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a2 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %2) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_fmsub231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub231ph_ymm: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a0 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %2) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_fmsub321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub321ph_ymm: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a0 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %2) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_fmsub132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub132ph_ymm: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a1 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %2) + ret <16 x half> %3 +} + +define <16 x half> 
@stack_fold_fmsub312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fmsub312ph_ymm: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a1 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %2) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_fmsub123ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmsub123ph_mask_ymm: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %neg) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fmsub213ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmsub213ph_mask_ymm: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %neg) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fmsub231ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmsub231ph_mask_ymm: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %neg) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fmsub321ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + 
;CHECK-LABEL: stack_fold_fmsub321ph_mask_ymm: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %neg) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fmsub132ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmsub132ph_mask_ymm: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %neg) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fmsub312ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fmsub312ph_mask_ymm: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %neg) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fmsub123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmsub123ph_maskz_ymm: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %neg) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> 
@stack_fold_fmsub213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmsub213ph_maskz_ymm: + ;CHECK: vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %neg) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fmsub231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmsub231ph_maskz_ymm: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %neg) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fmsub321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmsub321ph_maskz_ymm: + ;CHECK: vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %neg) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fmsub132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmsub132ph_maskz_ymm: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %neg) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> 
zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fmsub312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fmsub312ph_maskz_ymm: + ;CHECK: vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %neg) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fnmadd123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd123ph_ymm: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a0 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %a2) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_fnmadd213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd213ph_ymm: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a1 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %a2) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_fnmadd231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd231ph_ymm: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a1 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %a0) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_fnmadd321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd321ph_ymm: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a2 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %a0) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_fnmadd132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd132ph_ymm: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a0 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %a1) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_fnmadd312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmadd312ph_ymm: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a2 + %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %a1) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_fnmadd123ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd123ph_mask_ymm: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a2) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmadd213ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd213ph_mask_ymm: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x 
half> %a2) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmadd231ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd231ph_mask_ymm: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a0) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmadd321ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd321ph_mask_ymm: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a0) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmadd132ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd132ph_mask_ymm: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a1) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmadd312ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fnmadd312ph_mask_ymm: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x 
half> %a2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a1) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmadd123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd123ph_maskz_ymm: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a2) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fnmadd213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd213ph_maskz_ymm: + ;CHECK: vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a2) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fnmadd231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd231ph_maskz_ymm: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a0) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fnmadd321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd321ph_maskz_ymm: + ;CHECK: vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a0) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fnmadd132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd132ph_maskz_ymm: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a1) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fnmadd312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fnmadd312ph_maskz_ymm: + ;CHECK: vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a1) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fnmsub123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub123ph_ymm: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a0 + %3 = fneg <16 x half> %a2 + %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %3) + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmsub213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub213ph_ymm: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a1 + %3 = fneg <16 x half> %a2 + %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %3) + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmsub231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub231ph_ymm: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a1 + %3 = fneg <16 x half> %a0 + %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %3) + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmsub321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub321ph_ymm: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a2 + %3 = fneg <16 x half> %a0 + %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %3) + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmsub132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub132ph_ymm: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a0 + %3 = fneg <16 x half> %a1 + %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %3) + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmsub312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { + ;CHECK-LABEL: stack_fold_fnmsub312ph_ymm: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fneg <16 x half> %a2 + %3 = fneg <16 x half> %a1 + %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %3) + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmsub123ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 
%mask) { + ;CHECK-LABEL: stack_fold_fnmsub123ph_mask_ymm: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a2 + %neg1 = fneg <16 x half> %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmsub213ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub213ph_mask_ymm: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a2 + %neg1 = fneg <16 x half> %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmsub231ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub231ph_mask_ymm: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a0 + %neg1 = fneg <16 x half> %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmsub321ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub321ph_mask_ymm: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a0 + %neg1 = fneg <16 x half> %a2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg) + %3 = 
bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmsub132ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub132ph_mask_ymm: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a1 + %neg1 = fneg <16 x half> %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmsub312ph_mask_ymm(<16 x half>* %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { + ;CHECK-LABEL: stack_fold_fnmsub312ph_mask_ymm: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x half>, <16 x half>* %p + %neg = fneg <16 x half> %a1 + %neg1 = fneg <16 x half> %a2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg) + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 + ret <16 x half> %4 +} + +define <16 x half> @stack_fold_fnmsub123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub123ph_maskz_ymm: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a2 + %neg1 = fneg <16 x half> %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fnmsub213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub213ph_maskz_ymm: + ;CHECK: vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a2 + %neg1 = fneg <16 x half> %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fnmsub231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub231ph_maskz_ymm: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a0 + %neg1 = fneg <16 x half> %a1 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fnmsub321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub321ph_maskz_ymm: + ;CHECK: vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a0 + %neg1 = fneg <16 x half> %a2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fnmsub132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: stack_fold_fnmsub132ph_maskz_ymm: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a1 + %neg1 = fneg <16 x half> %a0 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_fnmsub312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, i16* %mask) { + ;CHECK-LABEL: 
stack_fold_fnmsub312ph_maskz_ymm: + ;CHECK: vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %neg = fneg <16 x half> %a1 + %neg1 = fneg <16 x half> %a2 + %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg) + %3 = load i16, i16* %mask + %4 = bitcast i16 %3 to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer + ret <16 x half> %5 +} diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll --- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll @@ -17,6 +17,7 @@ declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half>, metadata) +declare <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half>, <8 x half>, <8 x half>, metadata, metadata) define <8 x half> @f2(<8 x half> %a, <8 x half> %b) #0 { ; CHECK-LABEL: f2: @@ -101,6 +102,17 @@ ret <2 x double> %res } +define <8 x half> @f13(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 { +; CHECK-LABEL: f13: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %res +} + define <2 x double> @f15(<2 x half> %a) #0 { ; CHECK-LABEL: f15: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll --- a/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll @@ -7,6 +7,7 @@ declare <16 x half> @llvm.experimental.constrained.fmul.v16f16(<16 x half>, <16 x half>, metadata, metadata) declare <16 x half> @llvm.experimental.constrained.fdiv.v16f16(<16 x half>, <16 x half>, metadata, metadata) declare <16 x half> @llvm.experimental.constrained.sqrt.v16f16(<16 x half>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.fma.v16f16(<16 x half>, <16 x half>, <16 x half>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half>, metadata) declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half>, metadata) declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f64(<4 x double>, metadata, metadata) @@ -98,6 +99,17 @@ ret <4 x half> %ret } +define <16 x half> @f13(<16 x half> %a, <16 x half> %b, <16 x half> %c) #0 { +; CHECK-LABEL: f13: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <16 x half> @llvm.experimental.constrained.fma.v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %res +} + define <8 x float> @f14(<8 x half> %a) #0 { ; CHECK-LABEL: f14: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll 
b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll --- a/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll @@ -11,6 +11,7 @@ declare <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half>, metadata) declare <8 x half> @llvm.experimental.constrained.fptrunc.v8f16.v8f64(<8 x double>, metadata, metadata) declare <16 x half> @llvm.experimental.constrained.fptrunc.v16f16.v16f32(<16 x float>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.fma.v32f16(<32 x half>, <32 x half>, <32 x half>, metadata, metadata) declare <32 x half> @llvm.experimental.constrained.ceil.v32f16(<32 x half>, metadata) declare <32 x half> @llvm.experimental.constrained.floor.v32f16(<32 x half>, metadata) declare <32 x half> @llvm.experimental.constrained.trunc.v32f16(<32 x half>, metadata) @@ -97,6 +98,17 @@ ret <8 x half> %ret } +define <32 x half> @f13(<32 x half> %a, <32 x half> %b, <32 x half> %c) #0 { +; CHECK-LABEL: f13: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ph %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <32 x half> @llvm.experimental.constrained.fma.v32f16(<32 x half> %a, <32 x half> %b, <32 x half> %c, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %res +} + define <16 x float> @f14(<16 x half> %a) #0 { ; CHECK-LABEL: f14: ; CHECK: # %bb.0: diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16.txt b/llvm/test/MC/Disassembler/X86/avx512fp16.txt --- a/llvm/test/MC/Disassembler/X86/avx512fp16.txt +++ b/llvm/test/MC/Disassembler/X86/avx512fp16.txt @@ -1764,3 +1764,723 @@ # ATT: vsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} # INTEL: vsqrtsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] 0x62,0x65,0x16,0x87,0x51,0x72,0x80 + +# ATT: vfmadd132ph %zmm28, %zmm29, %zmm30 +# INTEL: vfmadd132ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0x98,0xf4 + +# ATT: vfmadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmadd132ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0x98,0xf4 + +# ATT: vfmadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmadd132ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfmadd132ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0x98,0x31 + +# ATT: vfmadd132ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmadd132ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0x98,0x71,0x7f + +# ATT: vfmadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmadd132ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0x98,0x72,0x80 + +# ATT: vfmadd132sh %xmm28, %xmm29, %xmm30 +# INTEL: vfmadd132sh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0x99,0xf4 + +# ATT: vfmadd132sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfmadd132sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x15,0x10,0x99,0xf4 + +# ATT: vfmadd132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfmadd132sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0x99,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132sh (%r9), %xmm29, %xmm30 +# INTEL: vfmadd132sh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0x99,0x31 + +# ATT: vfmadd132sh 254(%rcx), %xmm29, %xmm30 +# INTEL: vfmadd132sh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0x99,0x71,0x7f + +# ATT: vfmadd132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfmadd132sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] 
+0x62,0x66,0x15,0x87,0x99,0x72,0x80 + +# ATT: vfmadd213ph %zmm28, %zmm29, %zmm30 +# INTEL: vfmadd213ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0xa8,0xf4 + +# ATT: vfmadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmadd213ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xa8,0xf4 + +# ATT: vfmadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmadd213ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfmadd213ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0xa8,0x31 + +# ATT: vfmadd213ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmadd213ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0xa8,0x71,0x7f + +# ATT: vfmadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmadd213ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0xa8,0x72,0x80 + +# ATT: vfmadd213sh %xmm28, %xmm29, %xmm30 +# INTEL: vfmadd213sh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0xa9,0xf4 + +# ATT: vfmadd213sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfmadd213sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xa9,0xf4 + +# ATT: vfmadd213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfmadd213sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0xa9,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213sh (%r9), %xmm29, %xmm30 +# INTEL: vfmadd213sh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0xa9,0x31 + +# ATT: vfmadd213sh 254(%rcx), %xmm29, %xmm30 +# INTEL: vfmadd213sh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0xa9,0x71,0x7f + +# ATT: vfmadd213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfmadd213sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0xa9,0x72,0x80 + +# ATT: vfmadd231ph %zmm28, %zmm29, %zmm30 +# INTEL: vfmadd231ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0xb8,0xf4 + +# ATT: vfmadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmadd231ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xb8,0xf4 + +# ATT: vfmadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmadd231ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfmadd231ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0xb8,0x31 + +# ATT: vfmadd231ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmadd231ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0xb8,0x71,0x7f + +# ATT: vfmadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmadd231ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0xb8,0x72,0x80 + +# ATT: vfmadd231sh %xmm28, %xmm29, %xmm30 +# INTEL: vfmadd231sh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0xb9,0xf4 + +# ATT: vfmadd231sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfmadd231sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xb9,0xf4 + +# ATT: vfmadd231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfmadd231sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0xb9,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231sh (%r9), %xmm29, %xmm30 +# INTEL: vfmadd231sh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0xb9,0x31 + +# ATT: vfmadd231sh 254(%rcx), %xmm29, %xmm30 +# INTEL: vfmadd231sh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0xb9,0x71,0x7f + +# ATT: vfmadd231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: 
vfmadd231sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0xb9,0x72,0x80 + +# ATT: vfmaddsub132ph %zmm28, %zmm29, %zmm30 +# INTEL: vfmaddsub132ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0x96,0xf4 + +# ATT: vfmaddsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmaddsub132ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0x96,0xf4 + +# ATT: vfmaddsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmaddsub132ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0x96,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmaddsub132ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfmaddsub132ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0x96,0x31 + +# ATT: vfmaddsub132ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmaddsub132ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0x96,0x71,0x7f + +# ATT: vfmaddsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmaddsub132ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0x96,0x72,0x80 + +# ATT: vfmaddsub213ph %zmm28, %zmm29, %zmm30 +# INTEL: vfmaddsub213ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0xa6,0xf4 + +# ATT: vfmaddsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmaddsub213ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xa6,0xf4 + +# ATT: vfmaddsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmaddsub213ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0xa6,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmaddsub213ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfmaddsub213ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0xa6,0x31 + +# ATT: vfmaddsub213ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmaddsub213ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0xa6,0x71,0x7f + +# ATT: vfmaddsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmaddsub213ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0xa6,0x72,0x80 + +# ATT: vfmaddsub231ph %zmm28, %zmm29, %zmm30 +# INTEL: vfmaddsub231ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0xb6,0xf4 + +# ATT: vfmaddsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmaddsub231ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xb6,0xf4 + +# ATT: vfmaddsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmaddsub231ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0xb6,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmaddsub231ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfmaddsub231ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0xb6,0x31 + +# ATT: vfmaddsub231ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmaddsub231ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0xb6,0x71,0x7f + +# ATT: vfmaddsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmaddsub231ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0xb6,0x72,0x80 + +# ATT: vfmsub132ph %zmm28, %zmm29, %zmm30 +# INTEL: vfmsub132ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0x9a,0xf4 + +# ATT: vfmsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmsub132ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0x9a,0xf4 + +# ATT: vfmsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmsub132ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfmsub132ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0x9a,0x31 + 
+# ATT: vfmsub132ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmsub132ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0x9a,0x71,0x7f + +# ATT: vfmsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmsub132ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0x9a,0x72,0x80 + +# ATT: vfmsub132sh %xmm28, %xmm29, %xmm30 +# INTEL: vfmsub132sh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0x9b,0xf4 + +# ATT: vfmsub132sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfmsub132sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x15,0x10,0x9b,0xf4 + +# ATT: vfmsub132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfmsub132sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0x9b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132sh (%r9), %xmm29, %xmm30 +# INTEL: vfmsub132sh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0x9b,0x31 + +# ATT: vfmsub132sh 254(%rcx), %xmm29, %xmm30 +# INTEL: vfmsub132sh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0x9b,0x71,0x7f + +# ATT: vfmsub132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfmsub132sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0x9b,0x72,0x80 + +# ATT: vfmsub213ph %zmm28, %zmm29, %zmm30 +# INTEL: vfmsub213ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0xaa,0xf4 + +# ATT: vfmsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmsub213ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xaa,0xf4 + +# ATT: vfmsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmsub213ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfmsub213ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0xaa,0x31 + +# ATT: vfmsub213ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmsub213ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0xaa,0x71,0x7f + +# ATT: vfmsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmsub213ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0xaa,0x72,0x80 + +# ATT: vfmsub213sh %xmm28, %xmm29, %xmm30 +# INTEL: vfmsub213sh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0xab,0xf4 + +# ATT: vfmsub213sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfmsub213sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xab,0xf4 + +# ATT: vfmsub213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfmsub213sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0xab,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213sh (%r9), %xmm29, %xmm30 +# INTEL: vfmsub213sh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0xab,0x31 + +# ATT: vfmsub213sh 254(%rcx), %xmm29, %xmm30 +# INTEL: vfmsub213sh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0xab,0x71,0x7f + +# ATT: vfmsub213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfmsub213sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0xab,0x72,0x80 + +# ATT: vfmsub231ph %zmm28, %zmm29, %zmm30 +# INTEL: vfmsub231ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0xba,0xf4 + +# ATT: vfmsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmsub231ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xba,0xf4 + +# ATT: vfmsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmsub231ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfmsub231ph zmm30, zmm29, word ptr 
[r9]{1to32} +0x62,0x46,0x15,0x50,0xba,0x31 + +# ATT: vfmsub231ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmsub231ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0xba,0x71,0x7f + +# ATT: vfmsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmsub231ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0xba,0x72,0x80 + +# ATT: vfmsub231sh %xmm28, %xmm29, %xmm30 +# INTEL: vfmsub231sh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0xbb,0xf4 + +# ATT: vfmsub231sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfmsub231sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xbb,0xf4 + +# ATT: vfmsub231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfmsub231sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0xbb,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231sh (%r9), %xmm29, %xmm30 +# INTEL: vfmsub231sh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0xbb,0x31 + +# ATT: vfmsub231sh 254(%rcx), %xmm29, %xmm30 +# INTEL: vfmsub231sh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0xbb,0x71,0x7f + +# ATT: vfmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfmsub231sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0xbb,0x72,0x80 + +# ATT: vfmsubadd132ph %zmm28, %zmm29, %zmm30 +# INTEL: vfmsubadd132ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0x97,0xf4 + +# ATT: vfmsubadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmsubadd132ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0x97,0xf4 + +# ATT: vfmsubadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmsubadd132ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0x97,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsubadd132ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfmsubadd132ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0x97,0x31 + +# ATT: vfmsubadd132ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmsubadd132ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0x97,0x71,0x7f + +# ATT: vfmsubadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmsubadd132ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0x97,0x72,0x80 + +# ATT: vfmsubadd213ph %zmm28, %zmm29, %zmm30 +# INTEL: vfmsubadd213ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0xa7,0xf4 + +# ATT: vfmsubadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmsubadd213ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xa7,0xf4 + +# ATT: vfmsubadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmsubadd213ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0xa7,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsubadd213ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfmsubadd213ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0xa7,0x31 + +# ATT: vfmsubadd213ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmsubadd213ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0xa7,0x71,0x7f + +# ATT: vfmsubadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmsubadd213ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0xa7,0x72,0x80 + +# ATT: vfmsubadd231ph %zmm28, %zmm29, %zmm30 +# INTEL: vfmsubadd231ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0xb7,0xf4 + +# ATT: vfmsubadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfmsubadd231ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xb7,0xf4 + +# ATT: vfmsubadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfmsubadd231ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 
8*r14 + 268435456] +0x62,0x26,0x15,0x47,0xb7,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfmsubadd231ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfmsubadd231ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0xb7,0x31 + +# ATT: vfmsubadd231ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfmsubadd231ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0xb7,0x71,0x7f + +# ATT: vfmsubadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfmsubadd231ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0xb7,0x72,0x80 + +# ATT: vfnmadd132ph %zmm28, %zmm29, %zmm30 +# INTEL: vfnmadd132ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0x9c,0xf4 + +# ATT: vfnmadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfnmadd132ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0x9c,0xf4 + +# ATT: vfnmadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfnmadd132ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfnmadd132ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0x9c,0x31 + +# ATT: vfnmadd132ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfnmadd132ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0x9c,0x71,0x7f + +# ATT: vfnmadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfnmadd132ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0x9c,0x72,0x80 + +# ATT: vfnmadd132sh %xmm28, %xmm29, %xmm30 +# INTEL: vfnmadd132sh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0x9d,0xf4 + +# ATT: vfnmadd132sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfnmadd132sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x15,0x10,0x9d,0xf4 + +# ATT: vfnmadd132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfnmadd132sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0x9d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132sh (%r9), %xmm29, %xmm30 +# INTEL: vfnmadd132sh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0x9d,0x31 + +# ATT: vfnmadd132sh 254(%rcx), %xmm29, %xmm30 +# INTEL: vfnmadd132sh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0x9d,0x71,0x7f + +# ATT: vfnmadd132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfnmadd132sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0x9d,0x72,0x80 + +# ATT: vfnmadd213ph %zmm28, %zmm29, %zmm30 +# INTEL: vfnmadd213ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0xac,0xf4 + +# ATT: vfnmadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfnmadd213ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xac,0xf4 + +# ATT: vfnmadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfnmadd213ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfnmadd213ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0xac,0x31 + +# ATT: vfnmadd213ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfnmadd213ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0xac,0x71,0x7f + +# ATT: vfnmadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfnmadd213ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0xac,0x72,0x80 + +# ATT: vfnmadd213sh %xmm28, %xmm29, %xmm30 +# INTEL: vfnmadd213sh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0xad,0xf4 + +# ATT: vfnmadd213sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfnmadd213sh xmm30, xmm29, xmm28, {rn-sae} 
+0x62,0x06,0x15,0x10,0xad,0xf4 + +# ATT: vfnmadd213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfnmadd213sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0xad,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213sh (%r9), %xmm29, %xmm30 +# INTEL: vfnmadd213sh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0xad,0x31 + +# ATT: vfnmadd213sh 254(%rcx), %xmm29, %xmm30 +# INTEL: vfnmadd213sh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0xad,0x71,0x7f + +# ATT: vfnmadd213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfnmadd213sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0xad,0x72,0x80 + +# ATT: vfnmadd231ph %zmm28, %zmm29, %zmm30 +# INTEL: vfnmadd231ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0xbc,0xf4 + +# ATT: vfnmadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfnmadd231ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xbc,0xf4 + +# ATT: vfnmadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfnmadd231ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfnmadd231ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0xbc,0x31 + +# ATT: vfnmadd231ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfnmadd231ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0xbc,0x71,0x7f + +# ATT: vfnmadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfnmadd231ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0xbc,0x72,0x80 + +# ATT: vfnmadd231sh %xmm28, %xmm29, %xmm30 +# INTEL: vfnmadd231sh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0xbd,0xf4 + +# ATT: vfnmadd231sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfnmadd231sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xbd,0xf4 + +# ATT: vfnmadd231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfnmadd231sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0xbd,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231sh (%r9), %xmm29, %xmm30 +# INTEL: vfnmadd231sh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0xbd,0x31 + +# ATT: vfnmadd231sh 254(%rcx), %xmm29, %xmm30 +# INTEL: vfnmadd231sh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0xbd,0x71,0x7f + +# ATT: vfnmadd231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfnmadd231sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0xbd,0x72,0x80 + +# ATT: vfnmsub132ph %zmm28, %zmm29, %zmm30 +# INTEL: vfnmsub132ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0x9e,0xf4 + +# ATT: vfnmsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfnmsub132ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0x9e,0xf4 + +# ATT: vfnmsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfnmsub132ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfnmsub132ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0x9e,0x31 + +# ATT: vfnmsub132ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfnmsub132ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0x9e,0x71,0x7f + +# ATT: vfnmsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfnmsub132ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0x9e,0x72,0x80 + +# ATT: vfnmsub132sh %xmm28, %xmm29, %xmm30 +# INTEL: vfnmsub132sh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0x9f,0xf4 + +# ATT: 
vfnmsub132sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfnmsub132sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x15,0x10,0x9f,0xf4 + +# ATT: vfnmsub132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfnmsub132sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0x9f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132sh (%r9), %xmm29, %xmm30 +# INTEL: vfnmsub132sh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0x9f,0x31 + +# ATT: vfnmsub132sh 254(%rcx), %xmm29, %xmm30 +# INTEL: vfnmsub132sh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0x9f,0x71,0x7f + +# ATT: vfnmsub132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfnmsub132sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0x9f,0x72,0x80 + +# ATT: vfnmsub213ph %zmm28, %zmm29, %zmm30 +# INTEL: vfnmsub213ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0xae,0xf4 + +# ATT: vfnmsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfnmsub213ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xae,0xf4 + +# ATT: vfnmsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfnmsub213ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfnmsub213ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0xae,0x31 + +# ATT: vfnmsub213ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfnmsub213ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0xae,0x71,0x7f + +# ATT: vfnmsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfnmsub213ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0xae,0x72,0x80 + +# ATT: vfnmsub213sh %xmm28, %xmm29, %xmm30 +# INTEL: vfnmsub213sh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0xaf,0xf4 + +# ATT: vfnmsub213sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfnmsub213sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xaf,0xf4 + +# ATT: vfnmsub213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfnmsub213sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0xaf,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213sh (%r9), %xmm29, %xmm30 +# INTEL: vfnmsub213sh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0xaf,0x31 + +# ATT: vfnmsub213sh 254(%rcx), %xmm29, %xmm30 +# INTEL: vfnmsub213sh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0xaf,0x71,0x7f + +# ATT: vfnmsub213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfnmsub213sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0xaf,0x72,0x80 + +# ATT: vfnmsub231ph %zmm28, %zmm29, %zmm30 +# INTEL: vfnmsub231ph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0xbe,0xf4 + +# ATT: vfnmsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vfnmsub231ph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xbe,0xf4 + +# ATT: vfnmsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vfnmsub231ph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231ph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vfnmsub231ph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0xbe,0x31 + +# ATT: vfnmsub231ph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vfnmsub231ph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0xbe,0x71,0x7f + +# ATT: vfnmsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vfnmsub231ph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0xbe,0x72,0x80 + +# ATT: vfnmsub231sh %xmm28, 
%xmm29, %xmm30 +# INTEL: vfnmsub231sh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0xbf,0xf4 + +# ATT: vfnmsub231sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vfnmsub231sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x15,0x10,0xbf,0xf4 + +# ATT: vfnmsub231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vfnmsub231sh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0xbf,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231sh (%r9), %xmm29, %xmm30 +# INTEL: vfnmsub231sh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0xbf,0x31 + +# ATT: vfnmsub231sh 254(%rcx), %xmm29, %xmm30 +# INTEL: vfnmsub231sh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0xbf,0x71,0x7f + +# ATT: vfnmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vfnmsub231sh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0xbf,0x72,0x80 diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt --- a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt +++ b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt @@ -1492,3 +1492,723 @@ # ATT: vsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z} # INTEL: vsqrtph ymm6 {k7} {z}, word ptr [edx - 256]{1to16} 0x62,0xf5,0x7c,0xbf,0x51,0x72,0x80 + +# ATT: vfmadd132ph %ymm4, %ymm5, %ymm6 +# INTEL: vfmadd132ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0x98,0xf4 + +# ATT: vfmadd132ph %xmm4, %xmm5, %xmm6 +# INTEL: vfmadd132ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0x98,0xf4 + +# ATT: vfmadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmadd132ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0x98,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfmadd132ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0x98,0x31 + +# ATT: vfmadd132ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmadd132ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0x98,0x71,0x7f + +# ATT: vfmadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmadd132ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0x98,0x72,0x80 + +# ATT: vfmadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmadd132ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0x98,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd132ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfmadd132ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0x98,0x31 + +# ATT: vfmadd132ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmadd132ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0x98,0x71,0x7f + +# ATT: vfmadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmadd132ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0x98,0x72,0x80 + +# ATT: vfmadd213ph %ymm4, %ymm5, %ymm6 +# INTEL: vfmadd213ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0xa8,0xf4 + +# ATT: vfmadd213ph %xmm4, %xmm5, %xmm6 +# INTEL: vfmadd213ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0xa8,0xf4 + +# ATT: vfmadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmadd213ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0xa8,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfmadd213ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0xa8,0x31 + +# ATT: vfmadd213ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmadd213ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0xa8,0x71,0x7f + +# ATT: vfmadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmadd213ph ymm6 
{k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0xa8,0x72,0x80 + +# ATT: vfmadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmadd213ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0xa8,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd213ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfmadd213ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0xa8,0x31 + +# ATT: vfmadd213ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmadd213ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0xa8,0x71,0x7f + +# ATT: vfmadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmadd213ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0xa8,0x72,0x80 + +# ATT: vfmadd231ph %ymm4, %ymm5, %ymm6 +# INTEL: vfmadd231ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0xb8,0xf4 + +# ATT: vfmadd231ph %xmm4, %xmm5, %xmm6 +# INTEL: vfmadd231ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0xb8,0xf4 + +# ATT: vfmadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmadd231ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0xb8,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfmadd231ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0xb8,0x31 + +# ATT: vfmadd231ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmadd231ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0xb8,0x71,0x7f + +# ATT: vfmadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmadd231ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0xb8,0x72,0x80 + +# ATT: vfmadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmadd231ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0xb8,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmadd231ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfmadd231ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0xb8,0x31 + +# ATT: vfmadd231ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmadd231ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0xb8,0x71,0x7f + +# ATT: vfmadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmadd231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0xb8,0x72,0x80 + +# ATT: vfmaddsub132ph %ymm4, %ymm5, %ymm6 +# INTEL: vfmaddsub132ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0x96,0xf4 + +# ATT: vfmaddsub132ph %xmm4, %xmm5, %xmm6 +# INTEL: vfmaddsub132ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0x96,0xf4 + +# ATT: vfmaddsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmaddsub132ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0x96,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmaddsub132ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfmaddsub132ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0x96,0x31 + +# ATT: vfmaddsub132ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmaddsub132ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0x96,0x71,0x7f + +# ATT: vfmaddsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmaddsub132ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0x96,0x72,0x80 + +# ATT: vfmaddsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmaddsub132ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0x96,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmaddsub132ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfmaddsub132ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0x96,0x31 + +# ATT: vfmaddsub132ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: 
vfmaddsub132ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0x96,0x71,0x7f + +# ATT: vfmaddsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmaddsub132ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0x96,0x72,0x80 + +# ATT: vfmaddsub213ph %ymm4, %ymm5, %ymm6 +# INTEL: vfmaddsub213ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0xa6,0xf4 + +# ATT: vfmaddsub213ph %xmm4, %xmm5, %xmm6 +# INTEL: vfmaddsub213ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0xa6,0xf4 + +# ATT: vfmaddsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmaddsub213ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0xa6,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmaddsub213ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfmaddsub213ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0xa6,0x31 + +# ATT: vfmaddsub213ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmaddsub213ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0xa6,0x71,0x7f + +# ATT: vfmaddsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmaddsub213ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0xa6,0x72,0x80 + +# ATT: vfmaddsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmaddsub213ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0xa6,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmaddsub213ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfmaddsub213ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0xa6,0x31 + +# ATT: vfmaddsub213ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmaddsub213ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0xa6,0x71,0x7f + +# ATT: vfmaddsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmaddsub213ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0xa6,0x72,0x80 + +# ATT: vfmaddsub231ph %ymm4, %ymm5, %ymm6 +# INTEL: vfmaddsub231ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0xb6,0xf4 + +# ATT: vfmaddsub231ph %xmm4, %xmm5, %xmm6 +# INTEL: vfmaddsub231ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0xb6,0xf4 + +# ATT: vfmaddsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmaddsub231ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0xb6,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmaddsub231ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfmaddsub231ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0xb6,0x31 + +# ATT: vfmaddsub231ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmaddsub231ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0xb6,0x71,0x7f + +# ATT: vfmaddsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmaddsub231ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0xb6,0x72,0x80 + +# ATT: vfmaddsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmaddsub231ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0xb6,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmaddsub231ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfmaddsub231ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0xb6,0x31 + +# ATT: vfmaddsub231ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmaddsub231ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0xb6,0x71,0x7f + +# ATT: vfmaddsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmaddsub231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0xb6,0x72,0x80 + +# ATT: vfmsub132ph %ymm4, %ymm5, %ymm6 +# INTEL: vfmsub132ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0x9a,0xf4 + +# ATT: vfmsub132ph 
%xmm4, %xmm5, %xmm6 +# INTEL: vfmsub132ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0x9a,0xf4 + +# ATT: vfmsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmsub132ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0x9a,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfmsub132ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0x9a,0x31 + +# ATT: vfmsub132ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmsub132ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0x9a,0x71,0x7f + +# ATT: vfmsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmsub132ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0x9a,0x72,0x80 + +# ATT: vfmsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmsub132ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0x9a,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub132ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfmsub132ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0x9a,0x31 + +# ATT: vfmsub132ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmsub132ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0x9a,0x71,0x7f + +# ATT: vfmsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmsub132ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0x9a,0x72,0x80 + +# ATT: vfmsub213ph %ymm4, %ymm5, %ymm6 +# INTEL: vfmsub213ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0xaa,0xf4 + +# ATT: vfmsub213ph %xmm4, %xmm5, %xmm6 +# INTEL: vfmsub213ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0xaa,0xf4 + +# ATT: vfmsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmsub213ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0xaa,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfmsub213ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0xaa,0x31 + +# ATT: vfmsub213ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmsub213ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0xaa,0x71,0x7f + +# ATT: vfmsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmsub213ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0xaa,0x72,0x80 + +# ATT: vfmsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmsub213ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0xaa,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub213ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfmsub213ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0xaa,0x31 + +# ATT: vfmsub213ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmsub213ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0xaa,0x71,0x7f + +# ATT: vfmsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmsub213ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0xaa,0x72,0x80 + +# ATT: vfmsub231ph %ymm4, %ymm5, %ymm6 +# INTEL: vfmsub231ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0xba,0xf4 + +# ATT: vfmsub231ph %xmm4, %xmm5, %xmm6 +# INTEL: vfmsub231ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0xba,0xf4 + +# ATT: vfmsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmsub231ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0xba,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfmsub231ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0xba,0x31 + +# ATT: vfmsub231ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmsub231ph ymm6, ymm5, ymmword ptr [ecx 
+ 4064] +0x62,0xf6,0x55,0x28,0xba,0x71,0x7f + +# ATT: vfmsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmsub231ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0xba,0x72,0x80 + +# ATT: vfmsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmsub231ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0xba,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsub231ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfmsub231ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0xba,0x31 + +# ATT: vfmsub231ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmsub231ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0xba,0x71,0x7f + +# ATT: vfmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmsub231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0xba,0x72,0x80 + +# ATT: vfmsubadd132ph %ymm4, %ymm5, %ymm6 +# INTEL: vfmsubadd132ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0x97,0xf4 + +# ATT: vfmsubadd132ph %xmm4, %xmm5, %xmm6 +# INTEL: vfmsubadd132ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0x97,0xf4 + +# ATT: vfmsubadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmsubadd132ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0x97,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsubadd132ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfmsubadd132ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0x97,0x31 + +# ATT: vfmsubadd132ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmsubadd132ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0x97,0x71,0x7f + +# ATT: vfmsubadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmsubadd132ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0x97,0x72,0x80 + +# ATT: vfmsubadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmsubadd132ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0x97,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsubadd132ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfmsubadd132ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0x97,0x31 + +# ATT: vfmsubadd132ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmsubadd132ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0x97,0x71,0x7f + +# ATT: vfmsubadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmsubadd132ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0x97,0x72,0x80 + +# ATT: vfmsubadd213ph %ymm4, %ymm5, %ymm6 +# INTEL: vfmsubadd213ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0xa7,0xf4 + +# ATT: vfmsubadd213ph %xmm4, %xmm5, %xmm6 +# INTEL: vfmsubadd213ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0xa7,0xf4 + +# ATT: vfmsubadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmsubadd213ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0xa7,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsubadd213ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfmsubadd213ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0xa7,0x31 + +# ATT: vfmsubadd213ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmsubadd213ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0xa7,0x71,0x7f + +# ATT: vfmsubadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmsubadd213ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0xa7,0x72,0x80 + +# ATT: vfmsubadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmsubadd213ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0xa7,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# 
ATT: vfmsubadd213ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfmsubadd213ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0xa7,0x31 + +# ATT: vfmsubadd213ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmsubadd213ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0xa7,0x71,0x7f + +# ATT: vfmsubadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmsubadd213ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0xa7,0x72,0x80 + +# ATT: vfmsubadd231ph %ymm4, %ymm5, %ymm6 +# INTEL: vfmsubadd231ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0xb7,0xf4 + +# ATT: vfmsubadd231ph %xmm4, %xmm5, %xmm6 +# INTEL: vfmsubadd231ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0xb7,0xf4 + +# ATT: vfmsubadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfmsubadd231ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0xb7,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsubadd231ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfmsubadd231ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0xb7,0x31 + +# ATT: vfmsubadd231ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfmsubadd231ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0xb7,0x71,0x7f + +# ATT: vfmsubadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfmsubadd231ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0xb7,0x72,0x80 + +# ATT: vfmsubadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfmsubadd231ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0xb7,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfmsubadd231ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfmsubadd231ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0xb7,0x31 + +# ATT: vfmsubadd231ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfmsubadd231ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0xb7,0x71,0x7f + +# ATT: vfmsubadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfmsubadd231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0xb7,0x72,0x80 + +# ATT: vfnmadd132ph %ymm4, %ymm5, %ymm6 +# INTEL: vfnmadd132ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0x9c,0xf4 + +# ATT: vfnmadd132ph %xmm4, %xmm5, %xmm6 +# INTEL: vfnmadd132ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0x9c,0xf4 + +# ATT: vfnmadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfnmadd132ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0x9c,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfnmadd132ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0x9c,0x31 + +# ATT: vfnmadd132ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfnmadd132ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0x9c,0x71,0x7f + +# ATT: vfnmadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfnmadd132ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0x9c,0x72,0x80 + +# ATT: vfnmadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfnmadd132ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0x9c,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd132ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfnmadd132ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0x9c,0x31 + +# ATT: vfnmadd132ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfnmadd132ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0x9c,0x71,0x7f + +# ATT: vfnmadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfnmadd132ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} 
+0x62,0xf6,0x55,0x9f,0x9c,0x72,0x80 + +# ATT: vfnmadd213ph %ymm4, %ymm5, %ymm6 +# INTEL: vfnmadd213ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0xac,0xf4 + +# ATT: vfnmadd213ph %xmm4, %xmm5, %xmm6 +# INTEL: vfnmadd213ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0xac,0xf4 + +# ATT: vfnmadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfnmadd213ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0xac,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfnmadd213ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0xac,0x31 + +# ATT: vfnmadd213ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfnmadd213ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0xac,0x71,0x7f + +# ATT: vfnmadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfnmadd213ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0xac,0x72,0x80 + +# ATT: vfnmadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfnmadd213ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0xac,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd213ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfnmadd213ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0xac,0x31 + +# ATT: vfnmadd213ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfnmadd213ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0xac,0x71,0x7f + +# ATT: vfnmadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfnmadd213ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0xac,0x72,0x80 + +# ATT: vfnmadd231ph %ymm4, %ymm5, %ymm6 +# INTEL: vfnmadd231ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0xbc,0xf4 + +# ATT: vfnmadd231ph %xmm4, %xmm5, %xmm6 +# INTEL: vfnmadd231ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0xbc,0xf4 + +# ATT: vfnmadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfnmadd231ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0xbc,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfnmadd231ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0xbc,0x31 + +# ATT: vfnmadd231ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfnmadd231ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0xbc,0x71,0x7f + +# ATT: vfnmadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfnmadd231ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0xbc,0x72,0x80 + +# ATT: vfnmadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfnmadd231ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0xbc,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmadd231ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfnmadd231ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0xbc,0x31 + +# ATT: vfnmadd231ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfnmadd231ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0xbc,0x71,0x7f + +# ATT: vfnmadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfnmadd231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0xbc,0x72,0x80 + +# ATT: vfnmsub132ph %ymm4, %ymm5, %ymm6 +# INTEL: vfnmsub132ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0x9e,0xf4 + +# ATT: vfnmsub132ph %xmm4, %xmm5, %xmm6 +# INTEL: vfnmsub132ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0x9e,0xf4 + +# ATT: vfnmsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfnmsub132ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0x9e,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: 
vfnmsub132ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfnmsub132ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0x9e,0x31 + +# ATT: vfnmsub132ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfnmsub132ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0x9e,0x71,0x7f + +# ATT: vfnmsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfnmsub132ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0x9e,0x72,0x80 + +# ATT: vfnmsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfnmsub132ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0x9e,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub132ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfnmsub132ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0x9e,0x31 + +# ATT: vfnmsub132ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfnmsub132ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0x9e,0x71,0x7f + +# ATT: vfnmsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfnmsub132ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0x9e,0x72,0x80 + +# ATT: vfnmsub213ph %ymm4, %ymm5, %ymm6 +# INTEL: vfnmsub213ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0xae,0xf4 + +# ATT: vfnmsub213ph %xmm4, %xmm5, %xmm6 +# INTEL: vfnmsub213ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0xae,0xf4 + +# ATT: vfnmsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfnmsub213ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0xae,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfnmsub213ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0xae,0x31 + +# ATT: vfnmsub213ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfnmsub213ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0xae,0x71,0x7f + +# ATT: vfnmsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfnmsub213ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0xae,0x72,0x80 + +# ATT: vfnmsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfnmsub213ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0xae,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub213ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfnmsub213ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0xae,0x31 + +# ATT: vfnmsub213ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfnmsub213ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0xae,0x71,0x7f + +# ATT: vfnmsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfnmsub213ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0xae,0x72,0x80 + +# ATT: vfnmsub231ph %ymm4, %ymm5, %ymm6 +# INTEL: vfnmsub231ph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0xbe,0xf4 + +# ATT: vfnmsub231ph %xmm4, %xmm5, %xmm6 +# INTEL: vfnmsub231ph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0xbe,0xf4 + +# ATT: vfnmsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vfnmsub231ph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0xbe,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231ph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vfnmsub231ph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0xbe,0x31 + +# ATT: vfnmsub231ph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vfnmsub231ph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0xbe,0x71,0x7f + +# ATT: vfnmsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vfnmsub231ph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0xbe,0x72,0x80 + +# ATT: vfnmsub231ph 
268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vfnmsub231ph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0xbe,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vfnmsub231ph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vfnmsub231ph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0xbe,0x31 + +# ATT: vfnmsub231ph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vfnmsub231ph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0xbe,0x71,0x7f + +# ATT: vfnmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vfnmsub231ph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0xbe,0x72,0x80 diff --git a/llvm/test/MC/X86/avx512fp16.s b/llvm/test/MC/X86/avx512fp16.s --- a/llvm/test/MC/X86/avx512fp16.s +++ b/llvm/test/MC/X86/avx512fp16.s @@ -1763,3 +1763,723 @@ // CHECK: vsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} // CHECK: encoding: [0x62,0x65,0x16,0x87,0x51,0x72,0x80] vsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfmadd132ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0x98,0xf4] + vfmadd132ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x98,0xf4] + vfmadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmadd132ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0x98,0x31] + vfmadd132ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfmadd132ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0x98,0x71,0x7f] + vfmadd132ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x98,0x72,0x80] + vfmadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmadd132sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x99,0xf4] + vfmadd132sh %xmm28, %xmm29, %xmm30 + +// CHECK: vfmadd132sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x99,0xf4] + vfmadd132sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfmadd132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x99,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfmadd132sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0x99,0x31] + vfmadd132sh (%r9), %xmm29, %xmm30 + +// CHECK: vfmadd132sh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x99,0x71,0x7f] + vfmadd132sh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vfmadd132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0x99,0x72,0x80] + vfmadd132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfmadd213ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0xa8,0xf4] + vfmadd213ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xa8,0xf4] + vfmadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmadd213ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0xa8,0x31] + vfmadd213ph (%r9){1to32}, %zmm29, 
%zmm30 + +// CHECK: vfmadd213ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0xa8,0x71,0x7f] + vfmadd213ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xa8,0x72,0x80] + vfmadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmadd213sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xa9,0xf4] + vfmadd213sh %xmm28, %xmm29, %xmm30 + +// CHECK: vfmadd213sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xa9,0xf4] + vfmadd213sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfmadd213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xa9,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfmadd213sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0xa9,0x31] + vfmadd213sh (%r9), %xmm29, %xmm30 + +// CHECK: vfmadd213sh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xa9,0x71,0x7f] + vfmadd213sh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vfmadd213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0xa9,0x72,0x80] + vfmadd213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfmadd231ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0xb8,0xf4] + vfmadd231ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xb8,0xf4] + vfmadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmadd231ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0xb8,0x31] + vfmadd231ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfmadd231ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0xb8,0x71,0x7f] + vfmadd231ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xb8,0x72,0x80] + vfmadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmadd231sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xb9,0xf4] + vfmadd231sh %xmm28, %xmm29, %xmm30 + +// CHECK: vfmadd231sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xb9,0xf4] + vfmadd231sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfmadd231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xb9,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfmadd231sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0xb9,0x31] + vfmadd231sh (%r9), %xmm29, %xmm30 + +// CHECK: vfmadd231sh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xb9,0x71,0x7f] + vfmadd231sh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vfmadd231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0xb9,0x72,0x80] + vfmadd231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfmaddsub132ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0x96,0xf4] + vfmaddsub132ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmaddsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x96,0xf4] + vfmaddsub132ph {rn-sae}, %zmm28, %zmm29, 
%zmm30 + +// CHECK: vfmaddsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0x96,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmaddsub132ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0x96,0x31] + vfmaddsub132ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfmaddsub132ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0x96,0x71,0x7f] + vfmaddsub132ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmaddsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x96,0x72,0x80] + vfmaddsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmaddsub213ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0xa6,0xf4] + vfmaddsub213ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmaddsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xa6,0xf4] + vfmaddsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmaddsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0xa6,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmaddsub213ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0xa6,0x31] + vfmaddsub213ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfmaddsub213ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0xa6,0x71,0x7f] + vfmaddsub213ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmaddsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xa6,0x72,0x80] + vfmaddsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmaddsub231ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0xb6,0xf4] + vfmaddsub231ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmaddsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xb6,0xf4] + vfmaddsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmaddsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0xb6,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmaddsub231ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0xb6,0x31] + vfmaddsub231ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfmaddsub231ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0xb6,0x71,0x7f] + vfmaddsub231ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmaddsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xb6,0x72,0x80] + vfmaddsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmsub132ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0x9a,0xf4] + vfmsub132ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x9a,0xf4] + vfmsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmsub132ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0x9a,0x31] + vfmsub132ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfmsub132ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0x9a,0x71,0x7f] + vfmsub132ph 
8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x9a,0x72,0x80] + vfmsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmsub132sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x9b,0xf4] + vfmsub132sh %xmm28, %xmm29, %xmm30 + +// CHECK: vfmsub132sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x9b,0xf4] + vfmsub132sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfmsub132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x9b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfmsub132sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0x9b,0x31] + vfmsub132sh (%r9), %xmm29, %xmm30 + +// CHECK: vfmsub132sh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x9b,0x71,0x7f] + vfmsub132sh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vfmsub132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0x9b,0x72,0x80] + vfmsub132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfmsub213ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0xaa,0xf4] + vfmsub213ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xaa,0xf4] + vfmsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmsub213ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0xaa,0x31] + vfmsub213ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfmsub213ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0xaa,0x71,0x7f] + vfmsub213ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xaa,0x72,0x80] + vfmsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmsub213sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xab,0xf4] + vfmsub213sh %xmm28, %xmm29, %xmm30 + +// CHECK: vfmsub213sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xab,0xf4] + vfmsub213sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfmsub213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xab,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfmsub213sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0xab,0x31] + vfmsub213sh (%r9), %xmm29, %xmm30 + +// CHECK: vfmsub213sh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xab,0x71,0x7f] + vfmsub213sh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vfmsub213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0xab,0x72,0x80] + vfmsub213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfmsub231ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0xba,0xf4] + vfmsub231ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xba,0xf4] + vfmsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: 
[0x62,0x26,0x15,0x47,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmsub231ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0xba,0x31] + vfmsub231ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfmsub231ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0xba,0x71,0x7f] + vfmsub231ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xba,0x72,0x80] + vfmsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmsub231sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xbb,0xf4] + vfmsub231sh %xmm28, %xmm29, %xmm30 + +// CHECK: vfmsub231sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xbb,0xf4] + vfmsub231sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfmsub231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xbb,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfmsub231sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0xbb,0x31] + vfmsub231sh (%r9), %xmm29, %xmm30 + +// CHECK: vfmsub231sh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xbb,0x71,0x7f] + vfmsub231sh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vfmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0xbb,0x72,0x80] + vfmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfmsubadd132ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0x97,0xf4] + vfmsubadd132ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmsubadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x97,0xf4] + vfmsubadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmsubadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0x97,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsubadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmsubadd132ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0x97,0x31] + vfmsubadd132ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfmsubadd132ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0x97,0x71,0x7f] + vfmsubadd132ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmsubadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x97,0x72,0x80] + vfmsubadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmsubadd213ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0xa7,0xf4] + vfmsubadd213ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmsubadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xa7,0xf4] + vfmsubadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmsubadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0xa7,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsubadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmsubadd213ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0xa7,0x31] + vfmsubadd213ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfmsubadd213ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0xa7,0x71,0x7f] + vfmsubadd213ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmsubadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xa7,0x72,0x80] + 
vfmsubadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfmsubadd231ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0xb7,0xf4] + vfmsubadd231ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfmsubadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xb7,0xf4] + vfmsubadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfmsubadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0xb7,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsubadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfmsubadd231ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0xb7,0x31] + vfmsubadd231ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfmsubadd231ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0xb7,0x71,0x7f] + vfmsubadd231ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfmsubadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xb7,0x72,0x80] + vfmsubadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfnmadd132ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0x9c,0xf4] + vfnmadd132ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfnmadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x9c,0xf4] + vfnmadd132ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfnmadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfnmadd132ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0x9c,0x31] + vfnmadd132ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfnmadd132ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0x9c,0x71,0x7f] + vfnmadd132ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfnmadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x9c,0x72,0x80] + vfnmadd132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfnmadd132sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x9d,0xf4] + vfnmadd132sh %xmm28, %xmm29, %xmm30 + +// CHECK: vfnmadd132sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x9d,0xf4] + vfnmadd132sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfnmadd132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x9d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfnmadd132sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0x9d,0x31] + vfnmadd132sh (%r9), %xmm29, %xmm30 + +// CHECK: vfnmadd132sh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x9d,0x71,0x7f] + vfnmadd132sh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vfnmadd132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0x9d,0x72,0x80] + vfnmadd132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfnmadd213ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0xac,0xf4] + vfnmadd213ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfnmadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xac,0xf4] + vfnmadd213ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfnmadd213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213ph 268435456(%rbp,%r14,8), %zmm29, 
%zmm30 {%k7} + +// CHECK: vfnmadd213ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0xac,0x31] + vfnmadd213ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfnmadd213ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0xac,0x71,0x7f] + vfnmadd213ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfnmadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xac,0x72,0x80] + vfnmadd213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfnmadd213sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xad,0xf4] + vfnmadd213sh %xmm28, %xmm29, %xmm30 + +// CHECK: vfnmadd213sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xad,0xf4] + vfnmadd213sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfnmadd213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xad,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfnmadd213sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0xad,0x31] + vfnmadd213sh (%r9), %xmm29, %xmm30 + +// CHECK: vfnmadd213sh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xad,0x71,0x7f] + vfnmadd213sh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vfnmadd213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0xad,0x72,0x80] + vfnmadd213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfnmadd231ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0xbc,0xf4] + vfnmadd231ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfnmadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xbc,0xf4] + vfnmadd231ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfnmadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfnmadd231ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0xbc,0x31] + vfnmadd231ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfnmadd231ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0xbc,0x71,0x7f] + vfnmadd231ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfnmadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xbc,0x72,0x80] + vfnmadd231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfnmadd231sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xbd,0xf4] + vfnmadd231sh %xmm28, %xmm29, %xmm30 + +// CHECK: vfnmadd231sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xbd,0xf4] + vfnmadd231sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfnmadd231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xbd,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfnmadd231sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0xbd,0x31] + vfnmadd231sh (%r9), %xmm29, %xmm30 + +// CHECK: vfnmadd231sh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xbd,0x71,0x7f] + vfnmadd231sh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vfnmadd231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0xbd,0x72,0x80] + vfnmadd231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfnmsub132ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0x9e,0xf4] + 
vfnmsub132ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfnmsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x9e,0xf4] + vfnmsub132ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfnmsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfnmsub132ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0x9e,0x31] + vfnmsub132ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfnmsub132ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0x9e,0x71,0x7f] + vfnmsub132ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfnmsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x9e,0x72,0x80] + vfnmsub132ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfnmsub132sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x9f,0xf4] + vfnmsub132sh %xmm28, %xmm29, %xmm30 + +// CHECK: vfnmsub132sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x9f,0xf4] + vfnmsub132sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfnmsub132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x9f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfnmsub132sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0x9f,0x31] + vfnmsub132sh (%r9), %xmm29, %xmm30 + +// CHECK: vfnmsub132sh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x9f,0x71,0x7f] + vfnmsub132sh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vfnmsub132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0x9f,0x72,0x80] + vfnmsub132sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfnmsub213ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0xae,0xf4] + vfnmsub213ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfnmsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xae,0xf4] + vfnmsub213ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfnmsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfnmsub213ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0xae,0x31] + vfnmsub213ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfnmsub213ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0xae,0x71,0x7f] + vfnmsub213ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfnmsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xae,0x72,0x80] + vfnmsub213ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfnmsub213sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xaf,0xf4] + vfnmsub213sh %xmm28, %xmm29, %xmm30 + +// CHECK: vfnmsub213sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xaf,0xf4] + vfnmsub213sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfnmsub213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xaf,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfnmsub213sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0xaf,0x31] + vfnmsub213sh (%r9), %xmm29, %xmm30 + +// CHECK: vfnmsub213sh 254(%rcx), 
%xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xaf,0x71,0x7f] + vfnmsub213sh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vfnmsub213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0xaf,0x72,0x80] + vfnmsub213sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vfnmsub231ph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0xbe,0xf4] + vfnmsub231ph %zmm28, %zmm29, %zmm30 + +// CHECK: vfnmsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xbe,0xf4] + vfnmsub231ph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vfnmsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231ph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vfnmsub231ph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0xbe,0x31] + vfnmsub231ph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vfnmsub231ph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0xbe,0x71,0x7f] + vfnmsub231ph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vfnmsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0xbe,0x72,0x80] + vfnmsub231ph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vfnmsub231sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xbf,0xf4] + vfnmsub231sh %xmm28, %xmm29, %xmm30 + +// CHECK: vfnmsub231sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0xbf,0xf4] + vfnmsub231sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vfnmsub231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xbf,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vfnmsub231sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0xbf,0x31] + vfnmsub231sh (%r9), %xmm29, %xmm30 + +// CHECK: vfnmsub231sh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xbf,0x71,0x7f] + vfnmsub231sh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vfnmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0xbf,0x72,0x80] + vfnmsub231sh -256(%rdx), %xmm29, %xmm30 {%k7} {z} diff --git a/llvm/test/MC/X86/avx512fp16vl.s b/llvm/test/MC/X86/avx512fp16vl.s --- a/llvm/test/MC/X86/avx512fp16vl.s +++ b/llvm/test/MC/X86/avx512fp16vl.s @@ -1491,3 +1491,723 @@ // CHECK: vsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7c,0xbf,0x51,0x72,0x80] vsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vfmadd132ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x98,0xf4] + vfmadd132ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmadd132ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x98,0xf4] + vfmadd132ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x98,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmadd132ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x98,0x31] + vfmadd132ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfmadd132ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x98,0x71,0x7f] + vfmadd132ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x98,0x72,0x80] + vfmadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: 
vfmadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x98,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmadd132ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x98,0x31] + vfmadd132ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfmadd132ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x98,0x71,0x7f] + vfmadd132ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x98,0x72,0x80] + vfmadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmadd213ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xa8,0xf4] + vfmadd213ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmadd213ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa8,0xf4] + vfmadd213ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xa8,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmadd213ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xa8,0x31] + vfmadd213ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfmadd213ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xa8,0x71,0x7f] + vfmadd213ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xa8,0x72,0x80] + vfmadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xa8,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmadd213ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa8,0x31] + vfmadd213ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfmadd213ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa8,0x71,0x7f] + vfmadd213ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xa8,0x72,0x80] + vfmadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmadd231ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xb8,0xf4] + vfmadd231ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmadd231ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb8,0xf4] + vfmadd231ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xb8,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmadd231ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xb8,0x31] + vfmadd231ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfmadd231ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xb8,0x71,0x7f] + vfmadd231ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xb8,0x72,0x80] + vfmadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xb8,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmadd231ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb8,0x31] + vfmadd231ph 
(%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfmadd231ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb8,0x71,0x7f] + vfmadd231ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xb8,0x72,0x80] + vfmadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmaddsub132ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x96,0xf4] + vfmaddsub132ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmaddsub132ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x96,0xf4] + vfmaddsub132ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmaddsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x96,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmaddsub132ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x96,0x31] + vfmaddsub132ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfmaddsub132ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x96,0x71,0x7f] + vfmaddsub132ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmaddsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x96,0x72,0x80] + vfmaddsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmaddsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x96,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmaddsub132ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x96,0x31] + vfmaddsub132ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfmaddsub132ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x96,0x71,0x7f] + vfmaddsub132ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmaddsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x96,0x72,0x80] + vfmaddsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmaddsub213ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xa6,0xf4] + vfmaddsub213ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmaddsub213ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa6,0xf4] + vfmaddsub213ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmaddsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xa6,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmaddsub213ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xa6,0x31] + vfmaddsub213ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfmaddsub213ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xa6,0x71,0x7f] + vfmaddsub213ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmaddsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xa6,0x72,0x80] + vfmaddsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmaddsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xa6,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmaddsub213ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa6,0x31] + vfmaddsub213ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfmaddsub213ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa6,0x71,0x7f] + vfmaddsub213ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmaddsub213ph 
-256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xa6,0x72,0x80] + vfmaddsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmaddsub231ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xb6,0xf4] + vfmaddsub231ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmaddsub231ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb6,0xf4] + vfmaddsub231ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmaddsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xb6,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmaddsub231ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xb6,0x31] + vfmaddsub231ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfmaddsub231ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xb6,0x71,0x7f] + vfmaddsub231ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmaddsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xb6,0x72,0x80] + vfmaddsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmaddsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xb6,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmaddsub231ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb6,0x31] + vfmaddsub231ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfmaddsub231ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb6,0x71,0x7f] + vfmaddsub231ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmaddsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xb6,0x72,0x80] + vfmaddsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmsub132ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x9a,0xf4] + vfmsub132ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmsub132ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9a,0xf4] + vfmsub132ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x9a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmsub132ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x9a,0x31] + vfmsub132ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfmsub132ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x9a,0x71,0x7f] + vfmsub132ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x9a,0x72,0x80] + vfmsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x9a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmsub132ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9a,0x31] + vfmsub132ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfmsub132ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9a,0x71,0x7f] + vfmsub132ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x9a,0x72,0x80] + vfmsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmsub213ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xaa,0xf4] + 
vfmsub213ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmsub213ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xaa,0xf4] + vfmsub213ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xaa,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmsub213ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xaa,0x31] + vfmsub213ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfmsub213ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xaa,0x71,0x7f] + vfmsub213ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xaa,0x72,0x80] + vfmsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xaa,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmsub213ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xaa,0x31] + vfmsub213ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfmsub213ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xaa,0x71,0x7f] + vfmsub213ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xaa,0x72,0x80] + vfmsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmsub231ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xba,0xf4] + vfmsub231ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmsub231ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xba,0xf4] + vfmsub231ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xba,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmsub231ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xba,0x31] + vfmsub231ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfmsub231ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xba,0x71,0x7f] + vfmsub231ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xba,0x72,0x80] + vfmsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xba,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmsub231ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xba,0x31] + vfmsub231ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfmsub231ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xba,0x71,0x7f] + vfmsub231ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xba,0x72,0x80] + vfmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmsubadd132ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x97,0xf4] + vfmsubadd132ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmsubadd132ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x97,0xf4] + vfmsubadd132ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmsubadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: 
[0x62,0xf6,0x55,0x2f,0x97,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsubadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmsubadd132ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x97,0x31] + vfmsubadd132ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfmsubadd132ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x97,0x71,0x7f] + vfmsubadd132ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmsubadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x97,0x72,0x80] + vfmsubadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmsubadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x97,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsubadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmsubadd132ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x97,0x31] + vfmsubadd132ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfmsubadd132ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x97,0x71,0x7f] + vfmsubadd132ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmsubadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x97,0x72,0x80] + vfmsubadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmsubadd213ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xa7,0xf4] + vfmsubadd213ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmsubadd213ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa7,0xf4] + vfmsubadd213ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmsubadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xa7,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsubadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmsubadd213ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xa7,0x31] + vfmsubadd213ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfmsubadd213ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xa7,0x71,0x7f] + vfmsubadd213ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmsubadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xa7,0x72,0x80] + vfmsubadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmsubadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xa7,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsubadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmsubadd213ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa7,0x31] + vfmsubadd213ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfmsubadd213ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa7,0x71,0x7f] + vfmsubadd213ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmsubadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xa7,0x72,0x80] + vfmsubadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfmsubadd231ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xb7,0xf4] + vfmsubadd231ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfmsubadd231ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb7,0xf4] + vfmsubadd231ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfmsubadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xb7,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsubadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfmsubadd231ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: 
[0x62,0xf6,0x55,0x38,0xb7,0x31] + vfmsubadd231ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfmsubadd231ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xb7,0x71,0x7f] + vfmsubadd231ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfmsubadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xb7,0x72,0x80] + vfmsubadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfmsubadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xb7,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsubadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfmsubadd231ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb7,0x31] + vfmsubadd231ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfmsubadd231ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb7,0x71,0x7f] + vfmsubadd231ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfmsubadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xb7,0x72,0x80] + vfmsubadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfnmadd132ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x9c,0xf4] + vfnmadd132ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfnmadd132ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9c,0xf4] + vfnmadd132ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfnmadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x9c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfnmadd132ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x9c,0x31] + vfnmadd132ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfnmadd132ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x9c,0x71,0x7f] + vfnmadd132ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfnmadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x9c,0x72,0x80] + vfnmadd132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfnmadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x9c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfnmadd132ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9c,0x31] + vfnmadd132ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfnmadd132ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9c,0x71,0x7f] + vfnmadd132ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfnmadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x9c,0x72,0x80] + vfnmadd132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfnmadd213ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xac,0xf4] + vfnmadd213ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfnmadd213ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xac,0xf4] + vfnmadd213ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfnmadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xac,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfnmadd213ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xac,0x31] + vfnmadd213ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfnmadd213ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xac,0x71,0x7f] + vfnmadd213ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfnmadd213ph 
-256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xac,0x72,0x80] + vfnmadd213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfnmadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xac,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfnmadd213ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xac,0x31] + vfnmadd213ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfnmadd213ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xac,0x71,0x7f] + vfnmadd213ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfnmadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xac,0x72,0x80] + vfnmadd213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfnmadd231ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xbc,0xf4] + vfnmadd231ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfnmadd231ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbc,0xf4] + vfnmadd231ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfnmadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xbc,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfnmadd231ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xbc,0x31] + vfnmadd231ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfnmadd231ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xbc,0x71,0x7f] + vfnmadd231ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfnmadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xbc,0x72,0x80] + vfnmadd231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfnmadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xbc,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfnmadd231ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbc,0x31] + vfnmadd231ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfnmadd231ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbc,0x71,0x7f] + vfnmadd231ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfnmadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xbc,0x72,0x80] + vfnmadd231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfnmsub132ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x9e,0xf4] + vfnmsub132ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfnmsub132ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9e,0xf4] + vfnmsub132ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfnmsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x9e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfnmsub132ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x9e,0x31] + vfnmsub132ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfnmsub132ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x9e,0x71,0x7f] + vfnmsub132ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfnmsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x9e,0x72,0x80] + vfnmsub132ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfnmsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: 
[0x62,0xf6,0x55,0x0f,0x9e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfnmsub132ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9e,0x31] + vfnmsub132ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfnmsub132ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9e,0x71,0x7f] + vfnmsub132ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfnmsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x9e,0x72,0x80] + vfnmsub132ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfnmsub213ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xae,0xf4] + vfnmsub213ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfnmsub213ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xae,0xf4] + vfnmsub213ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfnmsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xae,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfnmsub213ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xae,0x31] + vfnmsub213ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfnmsub213ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xae,0x71,0x7f] + vfnmsub213ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfnmsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xae,0x72,0x80] + vfnmsub213ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfnmsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xae,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfnmsub213ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xae,0x31] + vfnmsub213ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vfnmsub213ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xae,0x71,0x7f] + vfnmsub213ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfnmsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xae,0x72,0x80] + vfnmsub213ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vfnmsub231ph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xbe,0xf4] + vfnmsub231ph %ymm4, %ymm5, %ymm6 + +// CHECK: vfnmsub231ph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbe,0xf4] + vfnmsub231ph %xmm4, %xmm5, %xmm6 + +// CHECK: vfnmsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0xbe,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231ph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vfnmsub231ph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0xbe,0x31] + vfnmsub231ph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vfnmsub231ph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0xbe,0x71,0x7f] + vfnmsub231ph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vfnmsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0xbe,0x72,0x80] + vfnmsub231ph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vfnmsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xbe,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmsub231ph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vfnmsub231ph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbe,0x31] + vfnmsub231ph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: 
vfnmsub231ph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbe,0x71,0x7f] + vfnmsub231ph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vfnmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0xbe,0x72,0x80] + vfnmsub231ph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16.s b/llvm/test/MC/X86/intel-syntax-avx512fp16.s --- a/llvm/test/MC/X86/intel-syntax-avx512fp16.s +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16.s @@ -1635,3 +1635,723 @@ // CHECK: vsqrtsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] // CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x51,0x72,0x80] vsqrtsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vfmadd132ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x98,0xf4] + vfmadd132ph zmm6, zmm5, zmm4 + +// CHECK: vfmadd132ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x98,0xf4] + vfmadd132ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmadd132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x98,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmadd132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd132ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x98,0x31] + vfmadd132ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfmadd132ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x98,0x71,0x7f] + vfmadd132ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmadd132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x98,0x72,0x80] + vfmadd132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfmadd132sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x99,0xf4] + vfmadd132sh xmm6, xmm5, xmm4 + +// CHECK: vfmadd132sh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x99,0xf4] + vfmadd132sh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfmadd132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x99,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmadd132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd132sh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x99,0x31] + vfmadd132sh xmm6, xmm5, word ptr [ecx] + +// CHECK: vfmadd132sh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x99,0x71,0x7f] + vfmadd132sh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vfmadd132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x99,0x72,0x80] + vfmadd132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vfmadd213ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xa8,0xf4] + vfmadd213ph zmm6, zmm5, zmm4 + +// CHECK: vfmadd213ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa8,0xf4] + vfmadd213ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmadd213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xa8,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmadd213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd213ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xa8,0x31] + vfmadd213ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfmadd213ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xa8,0x71,0x7f] + vfmadd213ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// 
CHECK: vfmadd213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xa8,0x72,0x80] + vfmadd213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfmadd213sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa9,0xf4] + vfmadd213sh xmm6, xmm5, xmm4 + +// CHECK: vfmadd213sh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa9,0xf4] + vfmadd213sh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfmadd213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xa9,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmadd213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd213sh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa9,0x31] + vfmadd213sh xmm6, xmm5, word ptr [ecx] + +// CHECK: vfmadd213sh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xa9,0x71,0x7f] + vfmadd213sh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vfmadd213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xa9,0x72,0x80] + vfmadd213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vfmadd231ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xb8,0xf4] + vfmadd231ph zmm6, zmm5, zmm4 + +// CHECK: vfmadd231ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb8,0xf4] + vfmadd231ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmadd231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xb8,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmadd231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd231ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xb8,0x31] + vfmadd231ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfmadd231ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xb8,0x71,0x7f] + vfmadd231ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmadd231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xb8,0x72,0x80] + vfmadd231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfmadd231sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb9,0xf4] + vfmadd231sh xmm6, xmm5, xmm4 + +// CHECK: vfmadd231sh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb9,0xf4] + vfmadd231sh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfmadd231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xb9,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmadd231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vfmadd231sh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb9,0x31] + vfmadd231sh xmm6, xmm5, word ptr [ecx] + +// CHECK: vfmadd231sh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xb9,0x71,0x7f] + vfmadd231sh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vfmadd231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xb9,0x72,0x80] + vfmadd231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vfmaddsub132ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x96,0xf4] + vfmaddsub132ph zmm6, zmm5, zmm4 + +// CHECK: vfmaddsub132ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x96,0xf4] + vfmaddsub132ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmaddsub132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// 
CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x96,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddsub132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmaddsub132ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x96,0x31] + vfmaddsub132ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfmaddsub132ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x96,0x71,0x7f] + vfmaddsub132ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmaddsub132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x96,0x72,0x80] + vfmaddsub132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfmaddsub213ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xa6,0xf4] + vfmaddsub213ph zmm6, zmm5, zmm4 + +// CHECK: vfmaddsub213ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa6,0xf4] + vfmaddsub213ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmaddsub213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xa6,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddsub213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmaddsub213ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xa6,0x31] + vfmaddsub213ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfmaddsub213ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xa6,0x71,0x7f] + vfmaddsub213ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmaddsub213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xa6,0x72,0x80] + vfmaddsub213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfmaddsub231ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xb6,0xf4] + vfmaddsub231ph zmm6, zmm5, zmm4 + +// CHECK: vfmaddsub231ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb6,0xf4] + vfmaddsub231ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmaddsub231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xb6,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmaddsub231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmaddsub231ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xb6,0x31] + vfmaddsub231ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfmaddsub231ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xb6,0x71,0x7f] + vfmaddsub231ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmaddsub231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xb6,0x72,0x80] + vfmaddsub231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfmsub132ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x9a,0xf4] + vfmsub132ph zmm6, zmm5, zmm4 + +// CHECK: vfmsub132ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9a,0xf4] + vfmsub132ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmsub132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x9a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsub132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub132ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x9a,0x31] + vfmsub132ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfmsub132ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: 
[0x62,0xf6,0x55,0x48,0x9a,0x71,0x7f] + vfmsub132ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmsub132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x9a,0x72,0x80] + vfmsub132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfmsub132sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9b,0xf4] + vfmsub132sh xmm6, xmm5, xmm4 + +// CHECK: vfmsub132sh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9b,0xf4] + vfmsub132sh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfmsub132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x9b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsub132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub132sh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9b,0x31] + vfmsub132sh xmm6, xmm5, word ptr [ecx] + +// CHECK: vfmsub132sh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9b,0x71,0x7f] + vfmsub132sh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vfmsub132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x9b,0x72,0x80] + vfmsub132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vfmsub213ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xaa,0xf4] + vfmsub213ph zmm6, zmm5, zmm4 + +// CHECK: vfmsub213ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xaa,0xf4] + vfmsub213ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmsub213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xaa,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsub213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub213ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xaa,0x31] + vfmsub213ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfmsub213ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xaa,0x71,0x7f] + vfmsub213ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmsub213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xaa,0x72,0x80] + vfmsub213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfmsub213sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xab,0xf4] + vfmsub213sh xmm6, xmm5, xmm4 + +// CHECK: vfmsub213sh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xab,0xf4] + vfmsub213sh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfmsub213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xab,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsub213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub213sh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xab,0x31] + vfmsub213sh xmm6, xmm5, word ptr [ecx] + +// CHECK: vfmsub213sh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xab,0x71,0x7f] + vfmsub213sh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vfmsub213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xab,0x72,0x80] + vfmsub213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vfmsub231ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xba,0xf4] + vfmsub231ph zmm6, zmm5, zmm4 + +// CHECK: vfmsub231ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xba,0xf4] + vfmsub231ph zmm6, zmm5, zmm4, {rn-sae} + +// 
CHECK: vfmsub231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xba,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsub231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub231ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xba,0x31] + vfmsub231ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfmsub231ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xba,0x71,0x7f] + vfmsub231ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmsub231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xba,0x72,0x80] + vfmsub231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfmsub231sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbb,0xf4] + vfmsub231sh xmm6, xmm5, xmm4 + +// CHECK: vfmsub231sh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbb,0xf4] + vfmsub231sh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfmsub231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xbb,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsub231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsub231sh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbb,0x31] + vfmsub231sh xmm6, xmm5, word ptr [ecx] + +// CHECK: vfmsub231sh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbb,0x71,0x7f] + vfmsub231sh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vfmsub231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xbb,0x72,0x80] + vfmsub231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vfmsubadd132ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x97,0xf4] + vfmsubadd132ph zmm6, zmm5, zmm4 + +// CHECK: vfmsubadd132ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x97,0xf4] + vfmsubadd132ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmsubadd132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x97,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsubadd132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsubadd132ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x97,0x31] + vfmsubadd132ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfmsubadd132ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x97,0x71,0x7f] + vfmsubadd132ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmsubadd132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x97,0x72,0x80] + vfmsubadd132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfmsubadd213ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xa7,0xf4] + vfmsubadd213ph zmm6, zmm5, zmm4 + +// CHECK: vfmsubadd213ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xa7,0xf4] + vfmsubadd213ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmsubadd213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xa7,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsubadd213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsubadd213ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xa7,0x31] + vfmsubadd213ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfmsubadd213ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: 
encoding: [0x62,0xf6,0x55,0x48,0xa7,0x71,0x7f] + vfmsubadd213ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmsubadd213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xa7,0x72,0x80] + vfmsubadd213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfmsubadd231ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xb7,0xf4] + vfmsubadd231ph zmm6, zmm5, zmm4 + +// CHECK: vfmsubadd231ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xb7,0xf4] + vfmsubadd231ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfmsubadd231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xb7,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfmsubadd231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfmsubadd231ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xb7,0x31] + vfmsubadd231ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfmsubadd231ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xb7,0x71,0x7f] + vfmsubadd231ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfmsubadd231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xb7,0x72,0x80] + vfmsubadd231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfnmadd132ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x9c,0xf4] + vfnmadd132ph zmm6, zmm5, zmm4 + +// CHECK: vfnmadd132ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9c,0xf4] + vfnmadd132ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfnmadd132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x9c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd132ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x9c,0x31] + vfnmadd132ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfnmadd132ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x9c,0x71,0x7f] + vfnmadd132ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfnmadd132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x9c,0x72,0x80] + vfnmadd132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfnmadd132sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9d,0xf4] + vfnmadd132sh xmm6, xmm5, xmm4 + +// CHECK: vfnmadd132sh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9d,0xf4] + vfnmadd132sh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfnmadd132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x9d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmadd132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd132sh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9d,0x31] + vfnmadd132sh xmm6, xmm5, word ptr [ecx] + +// CHECK: vfnmadd132sh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9d,0x71,0x7f] + vfnmadd132sh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vfnmadd132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x9d,0x72,0x80] + vfnmadd132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vfnmadd213ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xac,0xf4] + vfnmadd213ph zmm6, zmm5, zmm4 + +// CHECK: vfnmadd213ph zmm6, 
zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xac,0xf4] + vfnmadd213ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfnmadd213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xac,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd213ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xac,0x31] + vfnmadd213ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfnmadd213ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xac,0x71,0x7f] + vfnmadd213ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfnmadd213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xac,0x72,0x80] + vfnmadd213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfnmadd213sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xad,0xf4] + vfnmadd213sh xmm6, xmm5, xmm4 + +// CHECK: vfnmadd213sh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xad,0xf4] + vfnmadd213sh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfnmadd213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xad,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmadd213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd213sh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xad,0x31] + vfnmadd213sh xmm6, xmm5, word ptr [ecx] + +// CHECK: vfnmadd213sh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xad,0x71,0x7f] + vfnmadd213sh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vfnmadd213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xad,0x72,0x80] + vfnmadd213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vfnmadd231ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xbc,0xf4] + vfnmadd231ph zmm6, zmm5, zmm4 + +// CHECK: vfnmadd231ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbc,0xf4] + vfnmadd231ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfnmadd231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xbc,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd231ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xbc,0x31] + vfnmadd231ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfnmadd231ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xbc,0x71,0x7f] + vfnmadd231ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfnmadd231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xbc,0x72,0x80] + vfnmadd231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfnmadd231sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbd,0xf4] + vfnmadd231sh xmm6, xmm5, xmm4 + +// CHECK: vfnmadd231sh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbd,0xf4] + vfnmadd231sh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfnmadd231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xbd,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmadd231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmadd231sh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbd,0x31] + vfnmadd231sh xmm6, xmm5, word ptr 
[ecx] + +// CHECK: vfnmadd231sh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbd,0x71,0x7f] + vfnmadd231sh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vfnmadd231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xbd,0x72,0x80] + vfnmadd231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vfnmsub132ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x9e,0xf4] + vfnmsub132ph zmm6, zmm5, zmm4 + +// CHECK: vfnmsub132ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9e,0xf4] + vfnmsub132ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfnmsub132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x9e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub132ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x9e,0x31] + vfnmsub132ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfnmsub132ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x9e,0x71,0x7f] + vfnmsub132ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfnmsub132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x9e,0x72,0x80] + vfnmsub132ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfnmsub132sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9f,0xf4] + vfnmsub132sh xmm6, xmm5, xmm4 + +// CHECK: vfnmsub132sh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x9f,0xf4] + vfnmsub132sh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vfnmsub132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x9f,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmsub132sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub132sh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9f,0x31] + vfnmsub132sh xmm6, xmm5, word ptr [ecx] + +// CHECK: vfnmsub132sh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x9f,0x71,0x7f] + vfnmsub132sh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vfnmsub132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x9f,0x72,0x80] + vfnmsub132sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vfnmsub213ph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xae,0xf4] + vfnmsub213ph zmm6, zmm5, zmm4 + +// CHECK: vfnmsub213ph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xae,0xf4] + vfnmsub213ph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vfnmsub213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xae,0xb4,0xf4,0x00,0x00,0x00,0x10] + vfnmsub213ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vfnmsub213ph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xae,0x31] + vfnmsub213ph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vfnmsub213ph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xae,0x71,0x7f] + vfnmsub213ph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vfnmsub213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xae,0x72,0x80] + vfnmsub213ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vfnmsub213sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xaf,0xf4] + vfnmsub213sh xmm6, xmm5, xmm4 + 
+// CHECK: vfnmsub213sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xaf,0xf4]
+ vfnmsub213sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfnmsub213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xaf,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub213sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmsub213sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xaf,0x31]
+ vfnmsub213sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfnmsub213sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xaf,0x71,0x7f]
+ vfnmsub213sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfnmsub213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xaf,0x72,0x80]
+ vfnmsub213sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+
+// CHECK: vfnmsub231ph zmm6, zmm5, zmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xbe,0xf4]
+ vfnmsub231ph zmm6, zmm5, zmm4
+
+// CHECK: vfnmsub231ph zmm6, zmm5, zmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbe,0xf4]
+ vfnmsub231ph zmm6, zmm5, zmm4, {rn-sae}
+
+// CHECK: vfnmsub231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0xbe,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub231ph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmsub231ph zmm6, zmm5, word ptr [ecx]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0x58,0xbe,0x31]
+ vfnmsub231ph zmm6, zmm5, word ptr [ecx]{1to32}
+
+// CHECK: vfnmsub231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf6,0x55,0x48,0xbe,0x71,0x7f]
+ vfnmsub231ph zmm6, zmm5, zmmword ptr [ecx + 8128]
+
+// CHECK: vfnmsub231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0xbe,0x72,0x80]
+ vfnmsub231ph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32}
+
+// CHECK: vfnmsub231sh xmm6, xmm5, xmm4
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbf,0xf4]
+ vfnmsub231sh xmm6, xmm5, xmm4
+
+// CHECK: vfnmsub231sh xmm6, xmm5, xmm4, {rn-sae}
+// CHECK: encoding: [0x62,0xf6,0x55,0x18,0xbf,0xf4]
+ vfnmsub231sh xmm6, xmm5, xmm4, {rn-sae}
+
+// CHECK: vfnmsub231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0xbf,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ vfnmsub231sh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456]
+
+// CHECK: vfnmsub231sh xmm6, xmm5, word ptr [ecx]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbf,0x31]
+ vfnmsub231sh xmm6, xmm5, word ptr [ecx]
+
+// CHECK: vfnmsub231sh xmm6, xmm5, word ptr [ecx + 254]
+// CHECK: encoding: [0x62,0xf6,0x55,0x08,0xbf,0x71,0x7f]
+ vfnmsub231sh xmm6, xmm5, word ptr [ecx + 254]
+
+// CHECK: vfnmsub231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
+// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0xbf,0x72,0x80]
+ vfnmsub231sh xmm6 {k7} {z}, xmm5, word ptr [edx - 256]
diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
--- a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
+++ b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s
@@ -1491,3 +1491,723 @@
 // CHECK: vsqrtph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
 // CHECK: encoding: [0x62,0x65,0x7c,0xbf,0x51,0x72,0x80]
 vsqrtph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+
+// CHECK: vfmadd132ph ymm30, ymm29, ymm28
+// CHECK: encoding: [0x62,0x06,0x15,0x20,0x98,0xf4]
+ vfmadd132ph ymm30, ymm29, ymm28
+
+// CHECK: vfmadd132ph xmm30, xmm29, xmm28
+// CHECK: encoding: [0x62,0x06,0x15,0x00,0x98,0xf4]
+ vfmadd132ph xmm30,
xmm29, xmm28 + +// CHECK: vfmadd132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd132ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0x98,0x31] + vfmadd132ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfmadd132ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0x98,0x71,0x7f] + vfmadd132ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmadd132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x98,0x72,0x80] + vfmadd132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfmadd132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x98,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd132ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0x98,0x31] + vfmadd132ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfmadd132ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x98,0x71,0x7f] + vfmadd132ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmadd132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0x98,0x72,0x80] + vfmadd132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfmadd213ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0xa8,0xf4] + vfmadd213ph ymm30, ymm29, ymm28 + +// CHECK: vfmadd213ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xa8,0xf4] + vfmadd213ph xmm30, xmm29, xmm28 + +// CHECK: vfmadd213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd213ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0xa8,0x31] + vfmadd213ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfmadd213ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0xa8,0x71,0x7f] + vfmadd213ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmadd213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xa8,0x72,0x80] + vfmadd213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfmadd213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xa8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd213ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0xa8,0x31] + vfmadd213ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfmadd213ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xa8,0x71,0x7f] + vfmadd213ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmadd213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0xa8,0x72,0x80] + vfmadd213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfmadd231ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0xb8,0xf4] + vfmadd231ph ymm30, ymm29, ymm28 + +// CHECK: vfmadd231ph xmm30, xmm29, xmm28 +// CHECK: encoding: 
[0x62,0x06,0x15,0x00,0xb8,0xf4] + vfmadd231ph xmm30, xmm29, xmm28 + +// CHECK: vfmadd231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd231ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0xb8,0x31] + vfmadd231ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfmadd231ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0xb8,0x71,0x7f] + vfmadd231ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmadd231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xb8,0x72,0x80] + vfmadd231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfmadd231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xb8,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmadd231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmadd231ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0xb8,0x31] + vfmadd231ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfmadd231ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xb8,0x71,0x7f] + vfmadd231ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmadd231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0xb8,0x72,0x80] + vfmadd231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfmaddsub132ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0x96,0xf4] + vfmaddsub132ph ymm30, ymm29, ymm28 + +// CHECK: vfmaddsub132ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x96,0xf4] + vfmaddsub132ph xmm30, xmm29, xmm28 + +// CHECK: vfmaddsub132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0x96,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddsub132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmaddsub132ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0x96,0x31] + vfmaddsub132ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfmaddsub132ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0x96,0x71,0x7f] + vfmaddsub132ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmaddsub132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x96,0x72,0x80] + vfmaddsub132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfmaddsub132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x96,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddsub132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmaddsub132ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0x96,0x31] + vfmaddsub132ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfmaddsub132ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x96,0x71,0x7f] + vfmaddsub132ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmaddsub132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0x96,0x72,0x80] + vfmaddsub132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfmaddsub213ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0xa6,0xf4] + 
vfmaddsub213ph ymm30, ymm29, ymm28 + +// CHECK: vfmaddsub213ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xa6,0xf4] + vfmaddsub213ph xmm30, xmm29, xmm28 + +// CHECK: vfmaddsub213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0xa6,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddsub213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmaddsub213ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0xa6,0x31] + vfmaddsub213ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfmaddsub213ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0xa6,0x71,0x7f] + vfmaddsub213ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmaddsub213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xa6,0x72,0x80] + vfmaddsub213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfmaddsub213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xa6,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddsub213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmaddsub213ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0xa6,0x31] + vfmaddsub213ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfmaddsub213ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xa6,0x71,0x7f] + vfmaddsub213ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmaddsub213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0xa6,0x72,0x80] + vfmaddsub213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfmaddsub231ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0xb6,0xf4] + vfmaddsub231ph ymm30, ymm29, ymm28 + +// CHECK: vfmaddsub231ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xb6,0xf4] + vfmaddsub231ph xmm30, xmm29, xmm28 + +// CHECK: vfmaddsub231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0xb6,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddsub231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmaddsub231ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0xb6,0x31] + vfmaddsub231ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfmaddsub231ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0xb6,0x71,0x7f] + vfmaddsub231ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmaddsub231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xb6,0x72,0x80] + vfmaddsub231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfmaddsub231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xb6,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmaddsub231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmaddsub231ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0xb6,0x31] + vfmaddsub231ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfmaddsub231ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xb6,0x71,0x7f] + vfmaddsub231ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmaddsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0xb6,0x72,0x80] + vfmaddsub231ph 
xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfmsub132ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0x9a,0xf4] + vfmsub132ph ymm30, ymm29, ymm28 + +// CHECK: vfmsub132ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x9a,0xf4] + vfmsub132ph xmm30, xmm29, xmm28 + +// CHECK: vfmsub132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub132ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0x9a,0x31] + vfmsub132ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfmsub132ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0x9a,0x71,0x7f] + vfmsub132ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmsub132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x9a,0x72,0x80] + vfmsub132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfmsub132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x9a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub132ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0x9a,0x31] + vfmsub132ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfmsub132ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x9a,0x71,0x7f] + vfmsub132ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmsub132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0x9a,0x72,0x80] + vfmsub132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfmsub213ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0xaa,0xf4] + vfmsub213ph ymm30, ymm29, ymm28 + +// CHECK: vfmsub213ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xaa,0xf4] + vfmsub213ph xmm30, xmm29, xmm28 + +// CHECK: vfmsub213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub213ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0xaa,0x31] + vfmsub213ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfmsub213ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0xaa,0x71,0x7f] + vfmsub213ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmsub213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xaa,0x72,0x80] + vfmsub213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfmsub213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xaa,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub213ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0xaa,0x31] + vfmsub213ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfmsub213ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xaa,0x71,0x7f] + vfmsub213ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmsub213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: 
[0x62,0x66,0x15,0x97,0xaa,0x72,0x80] + vfmsub213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfmsub231ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0xba,0xf4] + vfmsub231ph ymm30, ymm29, ymm28 + +// CHECK: vfmsub231ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xba,0xf4] + vfmsub231ph xmm30, xmm29, xmm28 + +// CHECK: vfmsub231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub231ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0xba,0x31] + vfmsub231ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfmsub231ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0xba,0x71,0x7f] + vfmsub231ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmsub231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xba,0x72,0x80] + vfmsub231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfmsub231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xba,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsub231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsub231ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0xba,0x31] + vfmsub231ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfmsub231ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xba,0x71,0x7f] + vfmsub231ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0xba,0x72,0x80] + vfmsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfmsubadd132ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0x97,0xf4] + vfmsubadd132ph ymm30, ymm29, ymm28 + +// CHECK: vfmsubadd132ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x97,0xf4] + vfmsubadd132ph xmm30, xmm29, xmm28 + +// CHECK: vfmsubadd132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0x97,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsubadd132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsubadd132ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0x97,0x31] + vfmsubadd132ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfmsubadd132ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0x97,0x71,0x7f] + vfmsubadd132ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmsubadd132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x97,0x72,0x80] + vfmsubadd132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfmsubadd132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x97,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsubadd132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsubadd132ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0x97,0x31] + vfmsubadd132ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfmsubadd132ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x97,0x71,0x7f] + vfmsubadd132ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// 
CHECK: vfmsubadd132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0x97,0x72,0x80] + vfmsubadd132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfmsubadd213ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0xa7,0xf4] + vfmsubadd213ph ymm30, ymm29, ymm28 + +// CHECK: vfmsubadd213ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xa7,0xf4] + vfmsubadd213ph xmm30, xmm29, xmm28 + +// CHECK: vfmsubadd213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0xa7,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsubadd213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsubadd213ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0xa7,0x31] + vfmsubadd213ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfmsubadd213ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0xa7,0x71,0x7f] + vfmsubadd213ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmsubadd213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xa7,0x72,0x80] + vfmsubadd213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfmsubadd213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xa7,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsubadd213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsubadd213ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0xa7,0x31] + vfmsubadd213ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfmsubadd213ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xa7,0x71,0x7f] + vfmsubadd213ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmsubadd213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0xa7,0x72,0x80] + vfmsubadd213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfmsubadd231ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0xb7,0xf4] + vfmsubadd231ph ymm30, ymm29, ymm28 + +// CHECK: vfmsubadd231ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xb7,0xf4] + vfmsubadd231ph xmm30, xmm29, xmm28 + +// CHECK: vfmsubadd231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0xb7,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsubadd231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsubadd231ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0xb7,0x31] + vfmsubadd231ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfmsubadd231ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0xb7,0x71,0x7f] + vfmsubadd231ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfmsubadd231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xb7,0x72,0x80] + vfmsubadd231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfmsubadd231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xb7,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfmsubadd231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfmsubadd231ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0xb7,0x31] + vfmsubadd231ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfmsubadd231ph 
xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xb7,0x71,0x7f] + vfmsubadd231ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfmsubadd231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0xb7,0x72,0x80] + vfmsubadd231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmadd132ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0x9c,0xf4] + vfnmadd132ph ymm30, ymm29, ymm28 + +// CHECK: vfnmadd132ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x9c,0xf4] + vfnmadd132ph xmm30, xmm29, xmm28 + +// CHECK: vfnmadd132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd132ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0x9c,0x31] + vfnmadd132ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfnmadd132ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0x9c,0x71,0x7f] + vfnmadd132ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfnmadd132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x9c,0x72,0x80] + vfnmadd132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmadd132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x9c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd132ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0x9c,0x31] + vfnmadd132ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfnmadd132ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x9c,0x71,0x7f] + vfnmadd132ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfnmadd132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0x9c,0x72,0x80] + vfnmadd132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmadd213ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0xac,0xf4] + vfnmadd213ph ymm30, ymm29, ymm28 + +// CHECK: vfnmadd213ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xac,0xf4] + vfnmadd213ph xmm30, xmm29, xmm28 + +// CHECK: vfnmadd213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd213ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0xac,0x31] + vfnmadd213ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfnmadd213ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0xac,0x71,0x7f] + vfnmadd213ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfnmadd213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xac,0x72,0x80] + vfnmadd213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmadd213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xac,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd213ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: 
[0x62,0x46,0x15,0x10,0xac,0x31] + vfnmadd213ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfnmadd213ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xac,0x71,0x7f] + vfnmadd213ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfnmadd213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0xac,0x72,0x80] + vfnmadd213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmadd231ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0xbc,0xf4] + vfnmadd231ph ymm30, ymm29, ymm28 + +// CHECK: vfnmadd231ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xbc,0xf4] + vfnmadd231ph xmm30, xmm29, xmm28 + +// CHECK: vfnmadd231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd231ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0xbc,0x31] + vfnmadd231ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfnmadd231ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0xbc,0x71,0x7f] + vfnmadd231ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfnmadd231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xbc,0x72,0x80] + vfnmadd231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmadd231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xbc,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmadd231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmadd231ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0xbc,0x31] + vfnmadd231ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfnmadd231ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xbc,0x71,0x7f] + vfnmadd231ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfnmadd231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0xbc,0x72,0x80] + vfnmadd231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmsub132ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0x9e,0xf4] + vfnmsub132ph ymm30, ymm29, ymm28 + +// CHECK: vfnmsub132ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x9e,0xf4] + vfnmsub132ph xmm30, xmm29, xmm28 + +// CHECK: vfnmsub132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub132ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0x9e,0x31] + vfnmsub132ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfnmsub132ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0x9e,0x71,0x7f] + vfnmsub132ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfnmsub132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x9e,0x72,0x80] + vfnmsub132ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmsub132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x9e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub132ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// 
CHECK: vfnmsub132ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0x9e,0x31] + vfnmsub132ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfnmsub132ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x9e,0x71,0x7f] + vfnmsub132ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfnmsub132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0x9e,0x72,0x80] + vfnmsub132ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmsub213ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0xae,0xf4] + vfnmsub213ph ymm30, ymm29, ymm28 + +// CHECK: vfnmsub213ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xae,0xf4] + vfnmsub213ph xmm30, xmm29, xmm28 + +// CHECK: vfnmsub213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub213ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0xae,0x31] + vfnmsub213ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfnmsub213ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0xae,0x71,0x7f] + vfnmsub213ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfnmsub213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xae,0x72,0x80] + vfnmsub213ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmsub213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xae,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub213ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub213ph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0xae,0x31] + vfnmsub213ph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vfnmsub213ph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0xae,0x71,0x7f] + vfnmsub213ph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vfnmsub213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0xae,0x72,0x80] + vfnmsub213ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vfnmsub231ph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0xbe,0xf4] + vfnmsub231ph ymm30, ymm29, ymm28 + +// CHECK: vfnmsub231ph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0xbe,0xf4] + vfnmsub231ph xmm30, xmm29, xmm28 + +// CHECK: vfnmsub231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + vfnmsub231ph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vfnmsub231ph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0xbe,0x31] + vfnmsub231ph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vfnmsub231ph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0xbe,0x71,0x7f] + vfnmsub231ph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vfnmsub231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0xbe,0x72,0x80] + vfnmsub231ph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vfnmsub231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0xbe,0xb4,0xf5,0x00,0x00,0x00,0x10] + 
vfnmsub231ph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vfnmsub231ph xmm30, xmm29, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x46,0x15,0x10,0xbe,0x31]
+ vfnmsub231ph xmm30, xmm29, word ptr [r9]{1to8}
+
+// CHECK: vfnmsub231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x66,0x15,0x00,0xbe,0x71,0x7f]
+ vfnmsub231ph xmm30, xmm29, xmmword ptr [rcx + 2032]
+
+// CHECK: vfnmsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x66,0x15,0x97,0xbe,0x72,0x80]
+ vfnmsub231ph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8}
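
The assembler tests above exercise the packed (ph) and scalar (sh) FMA forms at 128/256/512 bits. As a rough usage sketch (illustrative only, not part of this patch), the C snippet below shows how these instructions are expected to be reached from user code; the intrinsic names used here (_mm512_set1_ph, _mm512_fmadd_ph, _mm512_mask_fmadd_ph, _mm512_storeu_ph) are assumed to follow the established AVX-512 naming convention, and the code would be built with something like `clang -O2 -mavx512fp16`.

/* Illustrative sketch only -- assumes the AVX512-FP16 intrinsics named
 * below exist with their usual AVX-512-style signatures. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  _Float16 out[32];

  __m512h a = _mm512_set1_ph((_Float16)1.5f);
  __m512h b = _mm512_set1_ph((_Float16)2.0f);
  __m512h c = _mm512_set1_ph((_Float16)0.25f);

  /* Packed fp16 FMA; expected to lower to one of the
     vfmadd{132,213,231}ph forms tested above. */
  __m512h r = _mm512_fmadd_ph(a, b, c);

  /* Merge-masked variant: lanes whose mask bit is clear keep the value
     of 'a', mirroring the {k7} assembler forms. */
  __mmask32 k = 0x0000FFFF;
  __m512h rm = _mm512_mask_fmadd_ph(a, k, b, c);

  _mm512_storeu_ph(out, rm);
  printf("lane 0 = %f\n", (double)out[0]); /* 1.5 * 2.0 + 0.25 = 3.25 */
  (void)r;
  return 0;
}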