diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -645,6 +645,9 @@ * Support intrinsic of ``_aand_i32/64`` * Support intrinsic of ``_aor_i32/64`` * Support intrinsic of ``_axor_i32/64`` +- Support ISA of ``AVX-IFMA``. + * Support intrinsic of ``_mm(256)_madd52hi_avx_epu64``. + * Support intrinsic of ``_mm(256)_madd52lo_avx_epu64``. WebAssembly Support in Clang ---------------------------- diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -1328,10 +1328,10 @@ TARGET_BUILTIN(__builtin_ia32_movdqa64store256_mask, "vV4Oi*V4OiUc", "nV:256:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512ifma") TARGET_BUILTIN(__builtin_ia32_vpmadd52luq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512ifma") -TARGET_BUILTIN(__builtin_ia32_vpmadd52huq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl") -TARGET_BUILTIN(__builtin_ia32_vpmadd52huq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl") -TARGET_BUILTIN(__builtin_ia32_vpmadd52luq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl") -TARGET_BUILTIN(__builtin_ia32_vpmadd52luq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vpmadd52huq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl|avxifma") +TARGET_BUILTIN(__builtin_ia32_vpmadd52huq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl|avxifma") +TARGET_BUILTIN(__builtin_ia32_vpmadd52luq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl|avxifma") +TARGET_BUILTIN(__builtin_ia32_vpmadd52luq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl|avxifma") TARGET_BUILTIN(__builtin_ia32_vcomisd, "iV2dV2dIiIi", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_vcomiss, "iV4fV4fIiIi", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_kunpckdi, "UOiUOiUOi", "nc", "avx512bw") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4588,6 +4588,8 @@ def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>; def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group<m_x86_Features_Group>; def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group<m_x86_Features_Group>; +def mavxifma : Flag<["-"], "mavxifma">, Group<m_x86_Features_Group>; +def mno_avxifma : Flag<["-"], "mno-avxifma">, Group<m_x86_Features_Group>; def mavxvnni : Flag<["-"], "mavxvnni">, Group<m_x86_Features_Group>; def mno_avxvnni : Flag<["-"], "mno-avxvnni">, Group<m_x86_Features_Group>; def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
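Note: the new -mavxifma / -mno-avxifma driver flags above map onto the avxifma target feature, and the X86.cpp hunk below predefines __AVXIFMA__ whenever the feature is enabled. As a hedged illustration (the file and function names here are invented for the example, not part of the patch), user code can gate its AVX-IFMA path on that macro:

  // Build sketch: clang -O2 -mavxifma ifma_demo.c
  #include <immintrin.h>

  #ifdef __AVXIFMA__
  // Uses the VEX-encoded form added by this patch; requires only AVX2-class
  // state, no AVX512VL registers or masking.
  __m256i madd52lo_demo(__m256i acc, __m256i a, __m256i b) {
    return _mm256_madd52lo_avx_epu64(acc, a, b);
  }
  #endif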
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -104,6 +104,7 @@ bool HasAVX512VL = false; bool HasAVX512VBMI = false; bool HasAVX512VBMI2 = false; + bool HasAVXIFMA = false; bool HasAVX512IFMA = false; bool HasAVX512VP2INTERSECT = false; bool HasSHA = false; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -338,6 +338,8 @@ HasCMPCCXADD = true; } else if (Feature == "+raoint") { HasRAOINT = true; + } else if (Feature == "+avxifma") { + HasAVXIFMA = true; } else if (Feature == "+avxvnni") { HasAVXVNNI = true; } else if (Feature == "+serialize") { @@ -790,6 +792,8 @@ Builder.defineMacro("__CMPCCXADD__"); if (HasRAOINT) Builder.defineMacro("__RAOINT__"); + if (HasAVXIFMA) + Builder.defineMacro("__AVXIFMA__"); if (HasAVXVNNI) Builder.defineMacro("__AVXVNNI__"); if (HasSERIALIZE) @@ -914,6 +918,7 @@ .Case("avx512vbmi2", true) .Case("avx512ifma", true) .Case("avx512vp2intersect", true) + .Case("avxifma", true) .Case("avxvnni", true) .Case("bmi", true) .Case("bmi2", true) @@ -994,7 +999,6 @@ .Case("amx-fp16", HasAMXFP16) .Case("amx-int8", HasAMXINT8) .Case("amx-tile", HasAMXTILE) - .Case("avxvnni", HasAVXVNNI) .Case("avx", SSELevel >= AVX) .Case("avx2", SSELevel >= AVX2) .Case("avx512f", SSELevel >= AVX512F) @@ -1013,6 +1017,8 @@ .Case("avx512vbmi2", HasAVX512VBMI2) .Case("avx512ifma", HasAVX512IFMA) .Case("avx512vp2intersect", HasAVX512VP2INTERSECT) + .Case("avxifma", HasAVXIFMA) + .Case("avxvnni", HasAVXVNNI) .Case("bmi", HasBMI) .Case("bmi2", HasBMI2) .Case("cldemote", HasCLDEMOTE) diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -142,6 +142,7 @@ avx512vp2intersectintrin.h avx512vpopcntdqintrin.h avx512vpopcntdqvlintrin.h + avxifmaintrin.h avxintrin.h avxvnniintrin.h bmi2intrin.h diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h --- a/clang/lib/Headers/avx512ifmavlintrin.h +++ b/clang/lib/Headers/avx512ifmavlintrin.h @@ -18,14 +18,21 @@ #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128))) #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256))) +#define _mm_madd52hi_epu64(X, Y, Z) \ + ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \ + (__v2di)(Z))) +#define _mm256_madd52hi_epu64(X, Y, Z) \ + ((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y), \ + (__v4di)(Z))) -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z) -{ - return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y, - (__v2di) __Z); -} +#define _mm_madd52lo_epu64(X, Y, Z) \ + ((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y), \ + (__v2di)(Z))) + +#define _mm256_madd52lo_epu64(X, Y, Z) \ + ((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \ + (__v4di)(Z))) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) @@ -43,13 +50,6 @@ (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z) -{ - return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y, - (__v4di)__Z); -} - static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { @@ -66,13 +66,6 @@ (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z) -{ - return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y, - (__v2di)__Z); -} - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { @@ -89,13 +82,6 @@ (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z) -{ - return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y, - (__v4di)__Z);
-} - static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/avxifmaintrin.h @@ -0,0 +1,177 @@ +/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVXIFMAINTRIN_H +#define __AVXIFMAINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ + __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ + __min_vector_width__(256))) + +// These intrinsics must always use VEX encoding. + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y +/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result to the corresponding +/// unsigned 64-bit integer in \a __X, and store the results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// __m128i +/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z) +/// \endcode +/// +/// This intrinsic corresponds to the \c VPMADD52HUQ instruction. +/// +/// \returns +/// The 128-bit result vector \a dst. +/// \param __X +/// A 128-bit vector of [2 x i64] +/// \param __Y +/// A 128-bit vector of [2 x i64] +/// \param __Z +/// A 128-bit vector of [2 x i64] +/// +/// \code{.operation} +/// FOR j := 0 to 1 +/// i := j*64 +/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) +/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { + return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y, + (__v2di)__Z); +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y +/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result to the corresponding +/// unsigned 64-bit integer in \a __X, and store the results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// __m256i +/// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z) +/// \endcode +/// +/// This intrinsic corresponds to the \c VPMADD52HUQ instruction. +/// +/// \returns +/// The 256-bit result vector \a dst.
+/// \param __X +/// A 256-bit vector of [4 x i64] +/// \param __Y +/// A 256-bit vector of [4 x i64] +/// \param __Z +/// A 256-bit vector of [4 x i64] +/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// i := j*64 +/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) +/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { + return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y, + (__v4di)__Z); +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y +/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result to the corresponding +/// unsigned 64-bit integer in \a __X, and store the results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// __m128i +/// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z) +/// \endcode +/// +/// This intrinsic corresponds to the \c VPMADD52LUQ instruction. +/// +/// \returns +/// The 128-bit result vector \a dst. +/// \param __X +/// A 128-bit vector of [2 x i64] +/// \param __Y +/// A 128-bit vector of [2 x i64] +/// \param __Z +/// A 128-bit vector of [2 x i64] +/// +/// \code{.operation} +/// FOR j := 0 to 1 +/// i := j*64 +/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) +/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { + return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y, + (__v2di)__Z); +} + +/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y +/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result to the corresponding +/// unsigned 64-bit integer in \a __X, and store the results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// __m256i +/// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z) +/// \endcode +/// +/// This intrinsic corresponds to the \c VPMADD52LUQ instruction. +/// +/// \returns +/// The 256-bit result vector \a dst.
+/// \param __X +/// A 256-bit vector of [4 x i64] +/// \param __Y +/// A 256-bit vector of [4 x i64] +/// \param __Z +/// A 256-bit vector of [4 x i64] +/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// i := j*64 +/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) +/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { + return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y, + (__v4di)__Z); +} +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif // __AVXIFMAINTRIN_H
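The \code{.operation} blocks above pin down the semantics: each 64-bit lane multiplies the low 52 bits of the two source operands into a 104-bit product, and the lo/hi intrinsic adds bits [51:0] or [103:52] of that product to the accumulator lane. A minimal, self-contained sketch exercising the two 256-bit intrinsics — the expected values follow directly from the pseudocode, and running it requires an AVX-IFMA-capable CPU:

  /* Sketch only: clang -O2 -mavxifma demo.c */
  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m256i a = _mm256_set1_epi64x(1LL << 51); /* one 52-bit limb per lane */
    __m256i b = _mm256_set1_epi64x(4);         /* a*b = 2^53 in each lane  */
    __m256i z = _mm256_setzero_si256();
    __m256i lo = _mm256_madd52lo_avx_epu64(z, a, b); /* adds tmp[51:0]   = 0 */
    __m256i hi = _mm256_madd52hi_avx_epu64(z, a, b); /* adds tmp[103:52] = 2 */
    long long l[4], h[4];
    _mm256_storeu_si256((__m256i *)l, lo);
    _mm256_storeu_si256((__m256i *)h, hi);
    printf("lo=%lld hi=%lld\n", l[0], h[0]); /* expect lo=0 hi=2 */
    return 0;
  }

This 52-bit multiply-accumulate is the usual building block for wide-integer (big-number) arithmetic, now reachable without AVX512VL registers or masking.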
"target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes" // CHECK-NOT: tune-cpu // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx" diff --git a/clang/test/CodeGen/avxifma-builtins.c b/clang/test/CodeGen/avxifma-builtins.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/avxifma-builtins.c @@ -0,0 +1,52 @@ +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror | FileCheck %s + +#include + +__m128i test_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { +// CHECK-LABEL: @test_mm_madd52hi_epu64 +// CHECK: call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128 + return _mm_madd52hi_epu64(__X, __Y, __Z); +} + +__m256i test_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) { +// CHECK-LABEL: @test_mm256_madd52hi_epu64 +// CHECK: call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256 + return _mm256_madd52hi_epu64(__X, __Y, __Z); +} + +__m128i test_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) { +// CHECK-LABEL: @test_mm_madd52lo_epu64 +// CHECK: call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128 + return _mm_madd52lo_epu64(__X, __Y, __Z); +} + +__m256i test_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) { +// CHECK-LABEL: @test_mm256_madd52lo_epu64 +// CHECK: call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256 + return _mm256_madd52lo_epu64(__X, __Y, __Z); +} + +__m128i test_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { +// CHECK-LABEL: @test_mm_madd52hi_avx_epu64 +// CHECK: call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128 + return _mm_madd52hi_avx_epu64(__X, __Y, __Z); +} + +__m256i test_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { +// CHECK-LABEL: @test_mm256_madd52hi_avx_epu64 +// CHECK: call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256 + return _mm256_madd52hi_avx_epu64(__X, __Y, __Z); +} + +__m128i test_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { +// CHECK-LABEL: @test_mm_madd52lo_avx_epu64 +// CHECK: call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128 + return _mm_madd52lo_avx_epu64(__X, __Y, __Z); +} + +__m256i test_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { +// CHECK-LABEL: @test_mm256_madd52lo_avx_epu64 +// CHECK: call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256 + return _mm256_madd52lo_avx_epu64(__X, __Y, __Z); +} diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -327,6 +327,11 @@ // RAOINT: "-target-feature" "+raoint" // NO-RAOINT: "-target-feature" "-raoint" +// RUN: %clang -target i386-linux-gnu -mavxifma %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVXIFMA %s +// RUN: %clang -target i386-linux-gnu -mno-avxifma %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVXIFMA %s +// AVXIFMA: "-target-feature" "+avxifma" +// 
NO-AVXIFMA: "-target-feature" "-avxifma" + // RUN: %clang --target=i386 -march=i386 -mcrc32 %s -### 2>&1 | FileCheck -check-prefix=CRC32 %s // RUN: %clang --target=i386 -march=i386 -mno-crc32 %s -### 2>&1 | FileCheck -check-prefix=NO-CRC32 %s // CRC32: "-target-feature" "+crc32" // NO-CRC32: "-target-feature" "-crc32" diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -602,6 +602,15 @@ // RUN: %clang -target x86_64-unknown-linux-gnu -march=atom -mno-cmpccxadd -x c -E -dM -o - %s | FileCheck -check-prefix=NO-CMPCCXADD %s // NO-CMPCCXADD-NOT: #define __CMPCCXADD__ 1 +// RUN: %clang -target i386-unknown-unknown -march=atom -mavxifma -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXIFMA %s + +// AVXIFMA: #define __AVX2__ 1 +// AVXIFMA: #define __AVXIFMA__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavxifma -mno-avx2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXIFMANOAVX2 %s + +// AVXIFMANOAVX2-NOT: #define __AVX2__ 1 +// AVXIFMANOAVX2-NOT: #define __AVXIFMA__ 1 // RUN: %clang -target i386-unknown-linux-gnu -march=atom -mraoint -x c -E -dM -o - %s | FileCheck -check-prefix=RAOINT %s diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -135,6 +135,7 @@ Changes to the X86 Backend -------------------------- +* Support ISA of ``AVX-IFMA``. * Add support for the ``RDMSRLIST and WRMSRLIST`` instructions. * Add support for the ``WRMSRNS`` instruction. diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def --- a/llvm/include/llvm/Support/X86TargetParser.def +++ b/llvm/include/llvm/Support/X86TargetParser.def @@ -206,6 +206,7 @@ X86_FEATURE (AMX_FP16, "amx-fp16") X86_FEATURE (CMPCCXADD, "cmpccxadd") X86_FEATURE (AVXVNNI, "avxvnni") +X86_FEATURE (AVXIFMA, "avxifma") // These features aren't really CPU features, but the frontend can set them. X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -1811,6 +1811,7 @@ Features["amx-fp16"] = HasLeaf7Subleaf1 && ((EAX >> 21) & 1) && HasAMXSave; Features["cmpccxadd"] = HasLeaf7Subleaf1 && ((EAX >> 7) & 1); Features["hreset"] = HasLeaf7Subleaf1 && ((EAX >> 22) & 1); + Features["avxifma"] = HasLeaf7Subleaf1 && ((EAX >> 23) & 1) && HasAVXSave; Features["prefetchi"] = HasLeaf7Subleaf1 && ((EDX >> 14) & 1); bool HasLeafD = MaxLevel >= 0xd &&
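The cpuid.h and Host.cpp hunks above agree on the enumeration: AVX-IFMA is reported in bit 23 of EAX under CPUID leaf 7, sub-leaf 1. A hedged sketch of an equivalent runtime check, written against the __get_cpuid_max and __cpuid_count helpers from clang's cpuid.h (the function name is illustrative; a production check should also verify OS XSAVE support for AVX state, as the HasAVXSave term in Host.cpp does — that part is elided here):

  #include <cpuid.h>

  static int has_avxifma(void) {
    unsigned eax, ebx, ecx, edx;
    if (__get_cpuid_max(0, 0) < 7)
      return 0;                              /* leaf 7 not available */
    __cpuid_count(7, 1, eax, ebx, ecx, edx); /* leaf 7, sub-leaf 1 */
    return (eax >> 23) & 1;                  /* bit_AVXIFMA */
  }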
diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp --- a/llvm/lib/Support/X86TargetParser.cpp +++ b/llvm/lib/Support/X86TargetParser.cpp @@ -585,6 +585,7 @@ constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {}; constexpr FeatureBitset ImpliedFeaturesCMPCCXADD = {}; constexpr FeatureBitset ImpliedFeaturesRAOINT = {}; +constexpr FeatureBitset ImpliedFeaturesAVXIFMA = FeatureAVX2; constexpr FeatureBitset ImpliedFeaturesAVX512FP16 = FeatureAVX512BW | FeatureAVX512DQ | FeatureAVX512VL; // Key Locker Features diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -155,6 +155,9 @@ def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true", "Enable AVX-512 further Vector Byte Manipulation Instructions", [FeatureBWI]>; +def FeatureAVXIFMA : SubtargetFeature<"avxifma", "HasAVXIFMA", "true", + "Enable AVX-IFMA", + [FeatureAVX2]>; def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true", "Enable AVX-512 Integer Fused Multiple-Add", [FeatureAVX512]>; diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -4103,12 +4103,16 @@ { X86::VPLZCNTQZ128rrk, X86::VPLZCNTQZ128rmk, 0 }, { X86::VPLZCNTQZ256rrk, X86::VPLZCNTQZ256rmk, 0 }, { X86::VPLZCNTQZrrk, X86::VPLZCNTQZrmk, 0 }, + { X86::VPMADD52HUQYrr, X86::VPMADD52HUQYrm, 0 }, { X86::VPMADD52HUQZ128r, X86::VPMADD52HUQZ128m, 0 }, { X86::VPMADD52HUQZ256r, X86::VPMADD52HUQZ256m, 0 }, { X86::VPMADD52HUQZr, X86::VPMADD52HUQZm, 0 }, + { X86::VPMADD52HUQrr, X86::VPMADD52HUQrm, 0 }, + { X86::VPMADD52LUQYrr, X86::VPMADD52LUQYrm, 0 }, { X86::VPMADD52LUQZ128r, X86::VPMADD52LUQZ128m, 0 }, { X86::VPMADD52LUQZ256r, X86::VPMADD52LUQZ256m, 0 }, { X86::VPMADD52LUQZr, X86::VPMADD52LUQZm, 0 }, + { X86::VPMADD52LUQrr, X86::VPMADD52LUQrm, 0 }, { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 }, { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 }, { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 }, diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2573,6 +2573,8 @@ case X86::VPDPWSSDSZr: case X86::VPDPWSSDSZrk: case X86::VPDPWSSDSZrkz: + case X86::VPMADD52HUQrr: + case X86::VPMADD52HUQYrr: case X86::VPMADD52HUQZ128r: case X86::VPMADD52HUQZ128rk: case X86::VPMADD52HUQZ128rkz: @@ -2582,6 +2584,8 @@ case X86::VPMADD52HUQZr: case X86::VPMADD52HUQZrk: case X86::VPMADD52HUQZrkz: + case X86::VPMADD52LUQrr: + case X86::VPMADD52LUQYrr: case X86::VPMADD52LUQZ128r: case X86::VPMADD52LUQZ128rk: case X86::VPMADD52LUQZ128rkz: diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td ---
a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -957,6 +957,8 @@ def HasVBMI : Predicate<"Subtarget->hasVBMI()">; def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">; def HasIFMA : Predicate<"Subtarget->hasIFMA()">; +def HasAVXIFMA : Predicate<"Subtarget->hasAVXIFMA()">; +def NoVLX_Or_NoIFMA : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasIFMA()">; def HasRTM : Predicate<"Subtarget->hasRTM()">; def HasADX : Predicate<"Subtarget->hasADX()">; def HasSHA : Predicate<"Subtarget->hasSHA()">; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -8123,3 +8123,40 @@ X86GF2P8affineqb>, TAPD; } +let Predicates = [HasAVXIFMA, NoVLX_Or_NoIFMA], Constraints = "$src1 = $dst", + checkVEXPredicate = 1 in +multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> { + // NOTE: The SDNode has the multiply operands first with the add last. + // This enables commuted load patterns to be autogenerated by tablegen. + let isCommutable = 1 in { + def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (OpNode VR128:$src2, VR128:$src3, VR128:$src1))]>, + VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; + } + def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i128mem:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (OpNode VR128:$src2, (loadv2i64 addr:$src3), VR128:$src1))]>, + VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; + let isCommutable = 1 in { + def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, (OpNode VR256:$src2, VR256:$src3, VR256:$src1))]>, + VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; + } + def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, i256mem:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, (OpNode VR256:$src2, (loadv4i64 addr:$src3), VR256:$src1))]>, + VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; +} + +defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, VEX_W, ExplicitVEXPrefix; +defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>, VEX_W, ExplicitVEXPrefix; diff --git a/llvm/test/CodeGen/X86/avx-ifma-intrinsics.ll b/llvm/test/CodeGen/X86/avx-ifma-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx-ifma-intrinsics.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxifma --show-mc-encoding | FileCheck %s --check-prefix=AVXIFMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxifma --show-mc-encoding | FileCheck %s --check-prefix=AVXIFMA +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxifma,+avx512ifma,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=AVX512IFMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxifma,+avx512ifma,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=AVX512IFMA + +declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>) + +define <2 x i64>@test_int_x86_avx_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; AVXIFMA-LABEL: test_int_x86_avx_vpmadd52h_uq_128: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: {vex} vpmadd52huq %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xb5,0xc2] +; AVXIFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512IFMA-LABEL: test_int_x86_avx_vpmadd52h_uq_128: +; AVX512IFMA: # %bb.0: +; AVX512IFMA-NEXT: {vex} vpmadd52huq %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xb5,0xc2] +; AVX512IFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) + ret <2 x i64> %res +} + +declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>) + +define <4 x i64>@test_int_x86_avx_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; AVXIFMA-LABEL: test_int_x86_avx_vpmadd52h_uq_256: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: {vex} vpmadd52huq %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xb5,0xc2] +;
AVXIFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512IFMA-LABEL: test_int_x86_avx_vpmadd52h_uq_256: +; AVX512IFMA: # %bb.0: +; AVX512IFMA-NEXT: {vex} vpmadd52huq %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xb5,0xc2] +; AVX512IFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) + ret <4 x i64> %res +} + +declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>) + +define <2 x i64>@test_int_x86_avx_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; AVXIFMA-LABEL: test_int_x86_avx_vpmadd52l_uq_128: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: {vex} vpmadd52luq %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xb4,0xc2] +; AVXIFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512IFMA-LABEL: test_int_x86_avx_vpmadd52l_uq_128: +; AVX512IFMA: # %bb.0: +; AVX512IFMA-NEXT: {vex} vpmadd52luq %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xb4,0xc2] +; AVX512IFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) + ret <2 x i64> %res +} + +declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>) + +define <4 x i64>@test_int_x86_avx_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; AVXIFMA-LABEL: test_int_x86_avx_vpmadd52l_uq_256: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xb4,0xc2] +; AVXIFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512IFMA-LABEL: test_int_x86_avx_vpmadd52l_uq_256: +; AVX512IFMA: # %bb.0: +; AVX512IFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xb4,0xc2] +; AVX512IFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) + ret <4 x i64> %res +} diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512ifma.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512ifma.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512ifma.ll @@ -0,0 +1,217 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl < %s | FileCheck %s + +declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>) +declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>) + +define <8 x i64> @stack_fold_vpmadd52huq(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) { +; CHECK-LABEL: stack_fold_vpmadd52huq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) + ret <8 x i64> %2 +} + +define <8 x i64> @stack_fold_vpmadd52huq_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) { +; 
CHECK-LABEL: stack_fold_vpmadd52huq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1) + ret <8 x i64> %2 +} + +define <8 x i64> @stack_fold_vpmadd52huq_mask(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) { +; CHECK-LABEL: stack_fold_vpmadd52huq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x i64>, ptr %a0 + %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %2, <8 x i64> %a1, <8 x i64> %a2) + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2 + ret <8 x i64> %5 +} + +define <8 x i64> @stack_fold_vpmadd52huq_mask_commuted(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) { +; CHECK-LABEL: stack_fold_vpmadd52huq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x i64>, ptr %a0 + %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %2, <8 x i64> %a2, <8 x i64> %a1) + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2 + ret <8 x i64> %5 +} + +define <8 x i64> @stack_fold_vpmadd52huq_maskz(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) { +; CHECK-LABEL: stack_fold_vpmadd52huq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movzbl (%rdi), %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) + %3 = load i8, ptr %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer + ret <8 x i64> %5 +} + +define <8 x i64> @stack_fold_vpmadd52huq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) { +; CHECK-LABEL: stack_fold_vpmadd52huq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movzbl (%rdi), %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1) + %3 = load i8, ptr %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer + ret <8 x i64> %5 +} + +define <8 x i64> @stack_fold_vpmadd52luq(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) { +; CHECK-LABEL: stack_fold_vpmadd52luq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) + ret <8 x i64> %2 +} + +define <8 x i64> @stack_fold_vpmadd52luq_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) { +; CHECK-LABEL: stack_fold_vpmadd52luq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1) + ret <8 x i64> %2 +} + +define <8 x i64> @stack_fold_vpmadd52luq_mask(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) { +; CHECK-LABEL: stack_fold_vpmadd52luq_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: 
nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x i64>, ptr %a0 + %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %2, <8 x i64> %a1, <8 x i64> %a2) + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2 + ret <8 x i64> %5 +} + +define <8 x i64> @stack_fold_vpmadd52luq_mask_commuted(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) { +; CHECK-LABEL: stack_fold_vpmadd52luq_mask_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x i64>, ptr %a0 + %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %2, <8 x i64> %a2, <8 x i64> %a1) + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2 + ret <8 x i64> %5 +} + +define <8 x i64> @stack_fold_vpmadd52luq_maskz(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) { +; CHECK-LABEL: stack_fold_vpmadd52luq_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movzbl (%rdi), %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) + %3 = load i8, ptr %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer + ret <8 x i64> %5 +} + +define <8 x i64> @stack_fold_vpmadd52luq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) { +; CHECK-LABEL: stack_fold_vpmadd52luq_maskz_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movzbl (%rdi), %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1) + %3 = load i8, ptr %mask + %4 = bitcast i8 %3 to <8 x i1> + %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer + ret <8 x i64> %5 +} diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxifma.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxifma.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxifma.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxifma < %s | FileCheck %s + +declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>) +declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>) + +define <2 x i64> @stack_fold_vpmadd52huq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) { +; CHECK-LABEL: stack_fold_vpmadd52huq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) + ret <2 x i64> %2 +} + +define <2 x i64> @stack_fold_vpmadd52huq_commuted(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) { +; CHECK-LABEL: stack_fold_vpmadd52huq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %a0, <2 x i64> %a2, <2 x i64> %a1) + ret <2 x i64> %2 +} + +define <4 x i64> @stack_fold_vpmadd52huq_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) { +; CHECK-LABEL: stack_fold_vpmadd52huq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) + ret <4 x i64> %2 +} + +define <4 x i64> @stack_fold_vpmadd52huq_256_commuted(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) { +; CHECK-LABEL: stack_fold_vpmadd52huq_256_commuted: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %a0, <4 x i64> %a2, <4 x i64> %a1) + ret <4 x i64> %2 +} + +define <2 x i64> @stack_fold_vpmadd52luq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) { +; CHECK-LABEL: stack_fold_vpmadd52luq: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) + ret <2 x i64> %2 +} + +define <2 x i64> @stack_fold_vpmadd52luq_commuted(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) { +; CHECK-LABEL: stack_fold_vpmadd52luq_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %a0, <2 x i64> %a2, <2 x i64> %a1) + ret <2 x i64> %2 +} + +define <4 x i64> @stack_fold_vpmadd52luq_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) { +; CHECK-LABEL: stack_fold_vpmadd52luq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) + ret <4 x i64> %2 +} + +define <4 x i64> @stack_fold_vpmadd52luq_256_commuted(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) { +; CHECK-LABEL: stack_fold_vpmadd52luq_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %a0, <4 x i64> %a2, <4 x i64> %a1) + ret <4 x i64> %2 +} diff --git a/llvm/test/MC/Disassembler/X86/avx-ifma-32.txt b/llvm/test/MC/Disassembler/X86/avx-ifma-32.txt new file mode 100644 --- /dev/null +++ 
b/llvm/test/MC/Disassembler/X86/avx-ifma-32.txt @@ -0,0 +1,115 @@ +# RUN: llvm-mc --disassemble %s -triple=i686 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: {vex} vpmadd52huq %ymm4, %ymm3, %ymm2 +# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymm4 +0xc4,0xe2,0xe5,0xb5,0xd4 + +# ATT: {vex} vpmadd52huq %xmm4, %xmm3, %xmm2 +# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmm4 +0xc4,0xe2,0xe1,0xb5,0xd4 + +# ATT: {vex} vpmadd52huq 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0xe5,0xb5,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: {vex} vpmadd52huq 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0xe5,0xb5,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: {vex} vpmadd52huq (%eax), %ymm3, %ymm2 +# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0xe5,0xb5,0x10 + +# ATT: {vex} vpmadd52huq -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0xe5,0xb5,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: {vex} vpmadd52huq 4064(%ecx), %ymm3, %ymm2 +# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [ecx + 4064] +0xc4,0xe2,0xe5,0xb5,0x91,0xe0,0x0f,0x00,0x00 + +# ATT: {vex} vpmadd52huq -4096(%edx), %ymm3, %ymm2 +# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [edx - 4096] +0xc4,0xe2,0xe5,0xb5,0x92,0x00,0xf0,0xff,0xff + +# ATT: {vex} vpmadd52huq 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0xe1,0xb5,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: {vex} vpmadd52huq 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0xe1,0xb5,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: {vex} vpmadd52huq (%eax), %xmm3, %xmm2 +# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0xe1,0xb5,0x10 + +# ATT: {vex} vpmadd52huq -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0xe1,0xb5,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: {vex} vpmadd52huq 2032(%ecx), %xmm3, %xmm2 +# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [ecx + 2032] +0xc4,0xe2,0xe1,0xb5,0x91,0xf0,0x07,0x00,0x00 + +# ATT: {vex} vpmadd52huq -2048(%edx), %xmm3, %xmm2 +# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [edx - 2048] +0xc4,0xe2,0xe1,0xb5,0x92,0x00,0xf8,0xff,0xff + +# ATT: {vex} vpmadd52luq %ymm4, %ymm3, %ymm2 +# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymm4 +0xc4,0xe2,0xe5,0xb4,0xd4 + +# ATT: {vex} vpmadd52luq %xmm4, %xmm3, %xmm2 +# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmm4 +0xc4,0xe2,0xe1,0xb4,0xd4 + +# ATT: {vex} vpmadd52luq 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0xe5,0xb4,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: {vex} vpmadd52luq 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0xe5,0xb4,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: {vex} vpmadd52luq (%eax), %ymm3, %ymm2 +# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0xe5,0xb4,0x10 + +# ATT: {vex} vpmadd52luq -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0xe5,0xb4,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: {vex} vpmadd52luq 4064(%ecx), %ymm3, %ymm2 +# INTEL: {vex} vpmadd52luq ymm2, ymm3, 
ymmword ptr [ecx + 4064]
+0xc4,0xe2,0xe5,0xb4,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT: {vex} vpmadd52luq -4096(%edx), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0xe5,0xb4,0x92,0x00,0xf0,0xff,0xff
+
+# ATT: {vex} vpmadd52luq 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0xe1,0xb4,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52luq 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0xe1,0xb4,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52luq (%eax), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0xe1,0xb4,0x10
+
+# ATT: {vex} vpmadd52luq -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0xe1,0xb4,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: {vex} vpmadd52luq 2032(%ecx), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0xe1,0xb4,0x91,0xf0,0x07,0x00,0x00
+
+# ATT: {vex} vpmadd52luq -2048(%edx), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0xe1,0xb4,0x92,0x00,0xf8,0xff,0xff
+
diff --git a/llvm/test/MC/Disassembler/X86/avx-ifma-64.txt b/llvm/test/MC/Disassembler/X86/avx-ifma-64.txt
new file
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx-ifma-64.txt
@@ -0,0 +1,115 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: {vex} vpmadd52huq %ymm14, %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymm14
+0xc4,0x42,0x95,0xb5,0xe6
+
+# ATT: {vex} vpmadd52huq %xmm14, %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmm14
+0xc4,0x42,0x91,0xb5,0xe6
+
+# ATT: {vex} vpmadd52huq 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x95,0xb5,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52huq 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x95,0xb5,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52huq (%rip), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x95,0xb5,0x25,0x00,0x00,0x00,0x00
+
+# ATT: {vex} vpmadd52huq -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x95,0xb5,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: {vex} vpmadd52huq 4064(%rcx), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x95,0xb5,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT: {vex} vpmadd52huq -4096(%rdx), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x95,0xb5,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT: {vex} vpmadd52huq 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x91,0xb5,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52huq 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x91,0xb5,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52huq (%rip), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x91,0xb5,0x25,0x00,0x00,0x00,0x00
+
+# ATT: {vex} vpmadd52huq -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x91,0xb5,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: {vex} vpmadd52huq 2032(%rcx), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x91,0xb5,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT: {vex} vpmadd52huq -2048(%rdx), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x91,0xb5,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT: {vex} vpmadd52luq %ymm14, %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymm14
+0xc4,0x42,0x95,0xb4,0xe6
+
+# ATT: {vex} vpmadd52luq %xmm14, %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmm14
+0xc4,0x42,0x91,0xb4,0xe6
+
+# ATT: {vex} vpmadd52luq 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x95,0xb4,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52luq 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x95,0xb4,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52luq (%rip), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x95,0xb4,0x25,0x00,0x00,0x00,0x00
+
+# ATT: {vex} vpmadd52luq -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x95,0xb4,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: {vex} vpmadd52luq 4064(%rcx), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x95,0xb4,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT: {vex} vpmadd52luq -4096(%rdx), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x95,0xb4,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT: {vex} vpmadd52luq 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x91,0xb4,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52luq 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x91,0xb4,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52luq (%rip), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x91,0xb4,0x25,0x00,0x00,0x00,0x00
+
+# ATT: {vex} vpmadd52luq -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x91,0xb4,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: {vex} vpmadd52luq 2032(%rcx), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x91,0xb4,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT: {vex} vpmadd52luq -2048(%rdx), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x91,0xb4,0xa2,0x00,0xf8,0xff,0xff
+
diff --git a/llvm/test/MC/X86/avx-ifma-att-32.s b/llvm/test/MC/X86/avx-ifma-att-32.s
new file
--- /dev/null
+++ b/llvm/test/MC/X86/avx-ifma-att-32.s
@@ -0,0 +1,114 @@
+// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxifma --show-encoding %s | FileCheck %s
+
+// CHECK: {vex} vpmadd52huq %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0xd4]
+ {vex} vpmadd52huq %ymm4, %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0xd4]
+ {vex} vpmadd52huq %xmm4, %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52huq 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x10]
+ {vex} vpmadd52huq (%eax), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52huq -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq 4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x91,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52huq 4064(%ecx), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x92,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52huq -4096(%edx), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52huq 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52huq (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x10]
+ {vex} vpmadd52huq (%eax), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52huq -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52huq -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52huq 2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x91,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52huq 2032(%ecx), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52huq -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x92,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52huq -2048(%edx), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0xd4]
+ {vex} vpmadd52luq %ymm4, %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0xd4]
+ {vex} vpmadd52luq %xmm4, %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x10]
+ {vex} vpmadd52luq (%eax), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52luq -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq 4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x91,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52luq 4064(%ecx), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x92,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52luq -4096(%edx), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x10]
+ {vex} vpmadd52luq (%eax), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52luq -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq 2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x91,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52luq 2032(%ecx), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x92,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52luq -2048(%edx), %xmm3, %xmm2
+
diff --git a/llvm/test/MC/X86/avx-ifma-att-64.s b/llvm/test/MC/X86/avx-ifma-att-64.s
new file
--- /dev/null
+++ b/llvm/test/MC/X86/avx-ifma-att-64.s
@@ -0,0 +1,114 @@
+// RUN: llvm-mc -triple=x86_64-unknown-unknown -mattr=+avxifma --show-encoding < %s | FileCheck %s
+
+// CHECK: {vex} vpmadd52huq %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x95,0xb5,0xe6]
+ {vex} vpmadd52huq %ymm14, %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x91,0xb5,0xe6]
+ {vex} vpmadd52huq %xmm14, %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52huq 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x95,0xb5,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x95,0xb5,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52huq (%rip), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52huq -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq 4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0xa1,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52huq 4064(%rcx), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0xa2,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52huq -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x91,0xb5,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52huq 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x91,0xb5,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52huq (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52huq (%rip), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52huq -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52huq -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52huq 2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0xa1,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52huq 2032(%rcx), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52huq -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0xa2,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52huq -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x95,0xb4,0xe6]
+ {vex} vpmadd52luq %ymm14, %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x91,0xb4,0xe6]
+ {vex} vpmadd52luq %xmm14, %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x95,0xb4,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x95,0xb4,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52luq (%rip), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52luq -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq 4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0xa1,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52luq 4064(%rcx), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0xa2,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52luq -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x91,0xb4,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x91,0xb4,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52luq (%rip), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52luq -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq 2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0xa1,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52luq 2032(%rcx), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0xa2,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52luq -2048(%rdx), %xmm13, %xmm12
+
diff --git a/llvm/test/MC/X86/avx-ifma-intel-32.s b/llvm/test/MC/X86/avx-ifma-intel-32.s
new file
--- /dev/null
+++ b/llvm/test/MC/X86/avx-ifma-intel-32.s
@@ -0,0 +1,114 @@
+// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxifma -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0xd4]
+ {vex} vpmadd52huq ymm2, ymm3, ymm4
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0xd4]
+ {vex} vpmadd52huq xmm2, xmm3, xmm4
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x10]
+ {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x91,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x92,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x10]
+ {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x91,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x92,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0xd4]
+ {vex} vpmadd52luq ymm2, ymm3, ymm4
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0xd4]
+ {vex} vpmadd52luq xmm2, xmm3, xmm4
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x10]
+ {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x91,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x92,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x10]
+ {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x91,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x92,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [edx - 2048]
+
diff --git a/llvm/test/MC/X86/avx-ifma-intel-64.s b/llvm/test/MC/X86/avx-ifma-intel-64.s
new file
--- /dev/null
+++ b/llvm/test/MC/X86/avx-ifma-intel-64.s
@@ -0,0 +1,114 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxifma -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x95,0xb5,0xe6]
+ {vex} vpmadd52huq ymm12, ymm13, ymm14
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x91,0xb5,0xe6]
+ {vex} vpmadd52huq xmm12, xmm13, xmm14
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x95,0xb5,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x95,0xb5,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0xa1,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0xa2,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x91,0xb5,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x91,0xb5,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0xa1,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0xa2,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x95,0xb4,0xe6]
+ {vex} vpmadd52luq ymm12, ymm13, ymm14
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x91,0xb4,0xe6]
+ {vex} vpmadd52luq xmm12, xmm13, xmm14
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x95,0xb4,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x95,0xb4,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0xa1,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0xa2,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x91,0xb4,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x91,0xb4,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0xa1,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0xa2,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rdx - 2048]
+
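Note on the semantics under test: per the Intel SDM, each 64-bit lane of vpmadd52luq/vpmadd52huq multiplies the low 52 bits of the two source elements to form a 104-bit product, then adds the low (luq) or high (huq) 52 bits of that product to the destination accumulator. As a rough scalar model of one lane, here is a C sketch for illustration only (it is not part of the patch; the helper names are invented, and unsigned __int128 is a GCC/Clang extension):

#include <stdint.h>

/* One 64-bit lane of {vex} vpmadd52luq dst, x, y:
   dst += low 52 bits of (x[51:0] * y[51:0]). */
static inline uint64_t madd52lo_lane(uint64_t acc, uint64_t x, uint64_t y) {
  const uint64_t mask52 = (1ULL << 52) - 1;
  unsigned __int128 prod = (unsigned __int128)(x & mask52) * (y & mask52);
  return acc + ((uint64_t)prod & mask52);
}

/* One 64-bit lane of {vex} vpmadd52huq dst, x, y:
   dst += bits 103:52 of the same 104-bit product. */
static inline uint64_t madd52hi_lane(uint64_t acc, uint64_t x, uint64_t y) {
  const uint64_t mask52 = (1ULL << 52) - 1;
  unsigned __int128 prod = (unsigned __int128)(x & mask52) * (y & mask52);
  return acc + (uint64_t)(prod >> 52);
}

The 52-bit operand width matches the double-precision mantissa, which is what makes the instruction pair convenient for vectorized big-integer arithmetic; the {vex} prefix in the tests pins the assembler to the new VEX encoding rather than the AVX512IFMA EVEX form.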