diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -614,6 +614,10 @@ - Support ISA of ``CMPCCXADD``. * Support intrinsic of ``__cmpccxadd_epi32``. * Support intrinsic of ``__cmpccxadd_epi64``. +- Support ISA of ``AVX-VNNI-INT8``. + * Support intrinsic of ``_mm(256)_dpbssd(s)_epi32``. + * Support intrinsic of ``_mm(256)_dpbsud(s)_epi32``. + * Support intrinsic of ``_mm(256)_dpbuud(s)_epi32``. WebAssembly Support in Clang ---------------------------- diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -906,6 +906,7 @@ TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "ncV:512:", "avx512f") +// AVX-VNNI and AVX512-VNNI TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni") TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni") TARGET_BUILTIN(__builtin_ia32_vpdpbusd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni") @@ -919,6 +920,20 @@ TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni") TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni") +// AVX-VNNI-INT8 +TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8") +TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8") +TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8") +TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8") +TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8") +TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", 
"ncV:256:", "avxvnniint8") +TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8") +TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8") +TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8") +TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8") +TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8") +TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8") + TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2OiV2OivC*V2OiUcIi", "nV:128:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_gather3div4df, "V4dV4dvC*V4OiUcIi", "nV:256:", "avx512vl") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4588,6 +4588,8 @@ def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group; def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group; def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group; +def mavxvnniint8 : Flag<["-"], "mavxvnniint8">, Group; +def mno_avxvnniint8 : Flag<["-"], "mno-avxvnniint8">, Group; def mavxvnni : Flag<["-"], "mavxvnni">, Group; def mno_avxvnni : Flag<["-"], "mno-avxvnni">, Group; def madx : Flag<["-"], "madx">, Group; diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -139,6 +139,7 @@ bool HasENQCMD = false; bool HasAMXFP16 = false; bool HasCMPCCXADD = false; + bool HasAVXVNNIINT8 = false; bool HasKL = false; // For key locker bool HasWIDEKL = false; // For wide key locker bool HasHRESET = false; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp --- a/clang/lib/Basic/Targets/X86.cpp +++ 
b/clang/lib/Basic/Targets/X86.cpp @@ -338,6 +338,8 @@ HasCMPCCXADD = true; } else if (Feature == "+avxvnni") { HasAVXVNNI = true; + } else if (Feature == "+avxvnniint8") { + HasAVXVNNIINT8 = true; } else if (Feature == "+serialize") { HasSERIALIZE = true; } else if (Feature == "+tsxldtrk") { @@ -788,6 +790,8 @@ Builder.defineMacro("__CMPCCXADD__"); if (HasAVXVNNI) Builder.defineMacro("__AVXVNNI__"); + if (HasAVXVNNIINT8) + Builder.defineMacro("__AVXVNNIINT8__"); if (HasSERIALIZE) Builder.defineMacro("__SERIALIZE__"); if (HasTSXLDTRK) @@ -911,6 +915,7 @@ .Case("avx512ifma", true) .Case("avx512vp2intersect", true) .Case("avxvnni", true) + .Case("avxvnniint8", true) .Case("bmi", true) .Case("bmi2", true) .Case("cldemote", true) @@ -989,7 +994,6 @@ .Case("amx-fp16", HasAMXFP16) .Case("amx-int8", HasAMXINT8) .Case("amx-tile", HasAMXTILE) - .Case("avxvnni", HasAVXVNNI) .Case("avx", SSELevel >= AVX) .Case("avx2", SSELevel >= AVX2) .Case("avx512f", SSELevel >= AVX512F) @@ -1008,6 +1012,8 @@ .Case("avx512vbmi2", HasAVX512VBMI2) .Case("avx512ifma", HasAVX512IFMA) .Case("avx512vp2intersect", HasAVX512VP2INTERSECT) + .Case("avxvnni", HasAVXVNNI) + .Case("avxvnniint8", HasAVXVNNIINT8) .Case("bmi", HasBMI) .Case("bmi2", HasBMI2) .Case("cldemote", HasCLDEMOTE) diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -143,6 +143,7 @@ avx512vpopcntdqintrin.h avx512vpopcntdqvlintrin.h avxintrin.h + avxvnniint8intrin.h avxvnniintrin.h bmi2intrin.h bmiintrin.h diff --git a/clang/lib/Headers/avxvnniint8intrin.h b/clang/lib/Headers/avxvnniint8intrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/avxvnniint8intrin.h @@ -0,0 +1,471 @@ +/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifndef __AVXVNNIINT8INTRIN_H +#define __AVXVNNIINT8INTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \ + __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \ + __min_vector_width__(128))) + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate +/// signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// +/// \headerfile +/// +/// \code +/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// +/// \param __A +/// A 128-bit vector of [16 x char]. +/// \param __B +/// A 128-bit vector of [16 x char]. +/// \returns +/// A 128-bit vector of [4 x int]. 
+/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) +/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) +/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) +/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W, + __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A, + (__v4si)__B); +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate +/// signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// +/// \headerfile +/// +/// \code +/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// +/// \param __A +/// A 256-bit vector of [32 x char]. +/// \param __B +/// A 256-bit vector of [32 x char]. +/// \returns +/// A 256-bit vector of [8 x int]. 
+/// +/// \code{.operation} +/// FOR j := 0 to 7 +/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) +/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) +/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) +/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A, + (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate +/// signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in \a __W with signed saturation, and store the packed +/// 32-bit results in \a dst. +/// +/// \headerfile +/// +/// \code +/// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPBSSDS instruction. +/// +/// \param __A +/// A 128-bit vector of [16 x char]. +/// \param __B +/// A 128-bit vector of [16 x char]. +/// \returns +/// A 128-bit vector of [4 x int]. 
+/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) +/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) +/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) +/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) +/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W, + __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A, + (__v4si)__B); +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate +/// signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in \a __W with signed saturation, and store the packed +/// 32-bit results in \a dst. +/// +/// \headerfile +/// +/// \code +/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPBSSDS instruction. +/// +/// \param __A +/// A 256-bit vector of [32 x char]. +/// \param __B +/// A 256-bit vector of [32 x char]. +/// \returns +/// A 256-bit vector of [8 x int]. 
+/// +/// \code{.operation} +/// FOR j := 0 to 7 +/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) +/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) +/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) +/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) +/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A, + (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with +/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate +/// signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// +/// \headerfile +/// +/// \code +/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPBSUD instruction. +/// +/// \param __A +/// A 128-bit vector of [16 x char]. +/// \param __B +/// A 128-bit vector of [16 x unsigned char]. +/// \returns +/// A 128-bit vector of [4 x int]. 
+/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W, + __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A, + (__v4si)__B); +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with +/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate +/// signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// +/// \headerfile +/// +/// \code +/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPBSUD instruction. +/// +/// \param __A +/// A 256-bit vector of [32 x char]. +/// \param __B +/// A 256-bit vector of [32 x unsigned char]. +/// \returns +/// A 256-bit vector of [8 x int]. 
+/// +/// \code{.operation} +/// FOR j := 0 to 7 +/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A, + (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with +/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate +/// signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in \a __W with signed saturation, and store the packed +/// 32-bit results in \a dst. +/// +/// \headerfile +/// +/// \code +/// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPBSUDS instruction. +/// +/// \param __A +/// A 128-bit vector of [16 x char]. +/// \param __B +/// A 128-bit vector of [16 x unsigned char]. +/// \returns +/// A 128-bit vector of [4 x int]. 
+/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) +/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W, + __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A, + (__v4si)__B); +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with +/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate +/// signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in \a __W with signed saturation, and store the packed +/// 32-bit results in \a dst. +/// +/// \headerfile +/// +/// \code +/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPBSUDS instruction. +/// +/// \param __A +/// A 256-bit vector of [32 x char]. +/// \param __B +/// A 256-bit vector of [32 x unsigned char]. +/// \returns +/// A 256-bit vector of [8 x int]. 
+/// +/// \code{.operation} +/// FOR j := 0 to 7 +/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) +/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A, + (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate +/// signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// +/// \headerfile +/// +/// \code +/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPBUUD instruction. +/// +/// \param __A +/// A 128-bit vector of [16 x unsigned char]. +/// \param __B +/// A 128-bit vector of [16 x unsigned char]. +/// \returns +/// A 128-bit vector of [4 x int]. 
+/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) +/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) +/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) +/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W, + __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A, + (__v4si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate +/// signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// +/// \headerfile +/// +/// \code +/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPBUUD instruction. +/// +/// \param __A +/// A 256-bit vector of [32 x unsigned char]. +/// \param __B +/// A 256-bit vector of [32 x unsigned char]. +/// \returns +/// A 256-bit vector of [8 x int]. 
+/// +/// \code{.operation} +/// FOR j := 0 to 7 +/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) +/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) +/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) +/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A, + (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate +/// signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in \a __W with unsigned saturation, and store the packed +/// 32-bit results in \a dst. +/// +/// \headerfile +/// +/// \code +/// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPBUUDS instruction. +/// +/// \param __A +/// A 128-bit vector of [16 x unsigned char]. +/// \param __B +/// A 128-bit vector of [16 x unsigned char]. +/// \returns +/// A 128-bit vector of [4 x int]. 
+/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) +/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) +/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) +/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) +/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W, + __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A, + (__v4si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate +/// signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in \a __W with unsigned saturation, and store the packed +/// 32-bit results in \a dst. +/// +/// \headerfile +/// +/// \code +/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPBUUDS instruction. +/// +/// \param __A +/// A 256-bit vector of [32 x unsigned char]. +/// \param __B +/// A 256-bit vector of [32 x unsigned char]. +/// \returns +/// A 256-bit vector of [8 x int]. 
+/// +/// \code{.operation} +/// FOR j := 0 to 7 +/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) +/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) +/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) +/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) +/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A, + (__v8si)__B); +} +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif // __AVXVNNIINT8INTRIN_H diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h --- a/clang/lib/Headers/cpuid.h +++ b/clang/lib/Headers/cpuid.h @@ -207,6 +207,7 @@ #define bit_HRESET 0x00400000 /* Features in %edx for leaf 7 sub-leaf 1 */ +#define bit_AVXVNNIINT8 0x00000010 #define bit_PREFETCHI 0x00004000 /* Features in %eax for leaf 13 sub-leaf 1 */ diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -254,6 +254,11 @@ #include #endif +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVXVNNIINT8__) +#include +#endif + #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ defined(__RDPID__) /// Returns the value of the IA32_TSC_AUX MSR (0xc0000103). 
diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c --- a/clang/test/CodeGen/attr-target-x86.c +++ b/clang/test/CodeGen/attr-target-x86.c @@ -54,9 +54,9 @@ // CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686" // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" // CHECK-NOT: tune-cpu -// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" +// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686" -// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" +// CHECK: #4 = {{.*}}"target-cpu"="i686" 
"target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes" // CHECK-NOT: tune-cpu // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx" diff --git a/clang/test/CodeGen/avxvnniint8-builtins.c b/clang/test/CodeGen/avxvnniint8-builtins.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/avxvnniint8-builtins.c @@ -0,0 +1,76 @@ +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64- -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -triple=i386- -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s + +#include + +// CHECK-LABEL: @test_mm_dpbssd_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssd.128 +__m128i test_mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B) { + return _mm_dpbssd_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm_dpbssds_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssds.128 +__m128i test_mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B) { + return _mm_dpbssds_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm_dpbsud_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsud.128 +__m128i test_mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B) { + return _mm_dpbsud_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm_dpbsuds_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128 +__m128i test_mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B) { + return _mm_dpbsuds_epi32(__W, __A, __B); +} + +// 
CHECK-LABEL: @test_mm_dpbuud_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuud.128 +__m128i test_mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B) { + return _mm_dpbuud_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm_dpbuuds_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128 +__m128i test_mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B) { + return _mm_dpbuuds_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm256_dpbssd_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssd.256 +__m256i test_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) { + return _mm256_dpbssd_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm256_dpbssds_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssds.256 +__m256i test_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) { + return _mm256_dpbssds_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm256_dpbsud_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsud.256 +__m256i test_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) { + return _mm256_dpbsud_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm256_dpbsuds_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256 +__m256i test_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { + return _mm256_dpbsuds_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm256_dpbuud_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuud.256 +__m256i test_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) { + return _mm256_dpbuud_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm256_dpbuuds_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256 +__m256i test_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { + return _mm256_dpbuuds_epi32(__W, __A, __B); +} diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -322,6 +322,11 @@ // CMPCCXADD: "-target-feature" 
"+cmpccxadd" // NO-CMPCCXADD: "-target-feature" "-cmpccxadd" +// RUN: %clang --target=i386 -mavxvnniint8 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX-VNNIINT8 %s +// RUN: %clang --target=i386 -mno-avxvnniint8 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVX-VNNIINT8 %s +// AVX-VNNIINT8: "-target-feature" "+avxvnniint8" +// NO-AVX-VNNIINT8: "-target-feature" "-avxvnniint8" + // RUN: %clang --target=i386 -march=i386 -mcrc32 %s -### 2>&1 | FileCheck -check-prefix=CRC32 %s // RUN: %clang --target=i386 -march=i386 -mno-crc32 %s -### 2>&1 | FileCheck -check-prefix=NO-CRC32 %s // CRC32: "-target-feature" "+crc32" diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -603,6 +603,20 @@ // NO-CMPCCXADD-NOT: #define __CMPCCXADD__ 1 +// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnniint8 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNIINT8 %s + +// AVXVNNIINT8: #define __AVX2__ 1 +// AVXVNNIINT8: #define __AVXVNNIINT8__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mno-avxvnniint8 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOAVXVNNIINT8 %s + +// NOAVXVNNIINT8-NOT: #define __AVXVNNIINT8__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnniint8 -mno-avx2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNIINT8NOAVX2 %s + +// AVXVNNIINT8NOAVX2-NOT: #define __AVX2__ 1 +// AVXVNNIINT8NOAVX2-NOT: #define __AVXVNNIINT8__ 1 + // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mcrc32 -x c -E -dM -o - %s | FileCheck -check-prefix=CRC32 %s // CRC32: #define __CRC32__ 1 diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -140,6 +140,7 @@ * Add support for the ``WRMSRNS`` instruction. 
* Support ISA of ``AMX-FP16`` which contains ``tdpfp16ps`` instruction. * Support ISA of ``CMPCCXADD``. +* Support ISA of ``AVX-VNNI-INT8``. Changes to the OCaml bindings ----------------------------- diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -1929,6 +1929,66 @@ ClangBuiltin<"__builtin_ia32_vpdpwssds512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; + def int_x86_avx2_vpdpbssd_128 + : ClangBuiltin<"__builtin_ia32_vpdpbssd128">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbssd_256 + : ClangBuiltin<"__builtin_ia32_vpdpbssd256">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbssds_128 + : ClangBuiltin<"__builtin_ia32_vpdpbssds128">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbssds_256 + : ClangBuiltin<"__builtin_ia32_vpdpbssds256">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbsud_128 + : ClangBuiltin<"__builtin_ia32_vpdpbsud128">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbsud_256 + : ClangBuiltin<"__builtin_ia32_vpdpbsud256">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbsuds_128 + : ClangBuiltin<"__builtin_ia32_vpdpbsuds128">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbsuds_256 + : ClangBuiltin<"__builtin_ia32_vpdpbsuds256">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbuud_128 + : 
ClangBuiltin<"__builtin_ia32_vpdpbuud128">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbuud_256 + : ClangBuiltin<"__builtin_ia32_vpdpbuud256">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbuuds_128 + : ClangBuiltin<"__builtin_ia32_vpdpbuuds128">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbuuds_256 + : ClangBuiltin<"__builtin_ia32_vpdpbuuds256">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; } //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def --- a/llvm/include/llvm/Support/X86TargetParser.def +++ b/llvm/include/llvm/Support/X86TargetParser.def @@ -205,6 +205,7 @@ X86_FEATURE (AMX_FP16, "amx-fp16") X86_FEATURE (CMPCCXADD, "cmpccxadd") X86_FEATURE (AVXVNNI, "avxvnni") +X86_FEATURE (AVXVNNIINT8, "avxvnniint8") // These features aren't really CPU features, but the frontend can set them. 
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -1810,6 +1810,7 @@ Features["amx-fp16"] = HasLeaf7Subleaf1 && ((EAX >> 21) & 1) && HasAMXSave; Features["cmpccxadd"] = HasLeaf7Subleaf1 && ((EAX >> 7) & 1); Features["hreset"] = HasLeaf7Subleaf1 && ((EAX >> 22) & 1); + Features["avxvnniint8"] = HasLeaf7Subleaf1 && ((EDX >> 4) & 1) && HasAVXSave; Features["prefetchi"] = HasLeaf7Subleaf1 && ((EDX >> 14) & 1); bool HasLeafD = MaxLevel >= 0xd && diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp --- a/llvm/lib/Support/X86TargetParser.cpp +++ b/llvm/lib/Support/X86TargetParser.cpp @@ -582,6 +582,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; +constexpr FeatureBitset ImpliedFeaturesAVXVNNIINT8 = FeatureAVX2; constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {}; constexpr FeatureBitset ImpliedFeaturesCMPCCXADD = {}; constexpr FeatureBitset ImpliedFeaturesAVX512FP16 = diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -184,6 +184,10 @@ def FeatureFP16 : SubtargetFeature<"avx512fp16", "HasFP16", "true", "Support 16-bit floating point", [FeatureBWI, FeatureVLX, FeatureDQI]>; +def FeatureAVXVNNIINT8 : SubtargetFeature<"avxvnniint8", + "HasAVXVNNIINT8", "true", + "Enable AVX-VNNI-INT8", + [FeatureAVX2]>; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -584,6 +584,13 @@ VFCMULCSH, 
VFCMULCSH_RND, + VPDPBSUD, + VPDPBSUDS, + VPDPBUUD, + VPDPBUUDS, + VPDPBSSD, + VPDPBSSDS, + // Compress and expand. COMPRESS, EXPAND, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34162,6 +34162,12 @@ NODE_NAME_CASE(ENQCMD) NODE_NAME_CASE(ENQCMDS) NODE_NAME_CASE(VP2INTERSECT) + NODE_NAME_CASE(VPDPBSUD) + NODE_NAME_CASE(VPDPBSUDS) + NODE_NAME_CASE(VPDPBUUD) + NODE_NAME_CASE(VPDPBUUDS) + NODE_NAME_CASE(VPDPBSSD) + NODE_NAME_CASE(VPDPBSSDS) NODE_NAME_CASE(AESENC128KL) NODE_NAME_CASE(AESDEC128KL) NODE_NAME_CASE(AESENC256KL) diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -3995,6 +3995,14 @@ { X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 }, { X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 }, { X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 }, + { X86::VPDPBSSDSYrr, X86::VPDPBSSDSYrm, 0 }, + { X86::VPDPBSSDSrr, X86::VPDPBSSDSrm, 0 }, + { X86::VPDPBSSDYrr, X86::VPDPBSSDYrm, 0 }, + { X86::VPDPBSSDrr, X86::VPDPBSSDrm, 0 }, + { X86::VPDPBSUDSYrr, X86::VPDPBSUDSYrm, 0 }, + { X86::VPDPBSUDSrr, X86::VPDPBSUDSrm, 0 }, + { X86::VPDPBSUDYrr, X86::VPDPBSUDYrm, 0 }, + { X86::VPDPBSUDrr, X86::VPDPBSUDrm, 0 }, { X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 }, { X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 }, { X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 }, @@ -4005,6 +4013,10 @@ { X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 }, { X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 }, { X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 }, + { X86::VPDPBUUDSYrr, X86::VPDPBUUDSYrm, 0 }, + { X86::VPDPBUUDSrr, X86::VPDPBUUDSrm, 0 }, + { X86::VPDPBUUDYrr, X86::VPDPBUUDYrm, 0 }, + { X86::VPDPBUUDrr, X86::VPDPBUUDrm, 0 }, { X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 }, { X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 }, { 
X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 }, diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -813,6 +813,13 @@ SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2> ]>; +def X86vpdpbssd : SDNode<"X86ISD::VPDPBSSD", SDTVnni>; +def X86vpdpbssds : SDNode<"X86ISD::VPDPBSSDS", SDTVnni>; +def X86vpdpbsud : SDNode<"X86ISD::VPDPBSUD", SDTVnni>; +def X86vpdpbsuds : SDNode<"X86ISD::VPDPBSUDS", SDTVnni>; +def X86vpdpbuud : SDNode<"X86ISD::VPDPBUUD", SDTVnni>; +def X86vpdpbuuds : SDNode<"X86ISD::VPDPBUUDS", SDTVnni>; + //===----------------------------------------------------------------------===// // SSE pattern fragments //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2555,6 +2555,14 @@ case X86::VPDPWSSDrr: case X86::VPDPWSSDSYrr: case X86::VPDPWSSDSrr: + case X86::VPDPBSSDSrr: + case X86::VPDPBSSDSYrr: + case X86::VPDPBSSDrr: + case X86::VPDPBSSDYrr: + case X86::VPDPBUUDSrr: + case X86::VPDPBUUDSYrr: + case X86::VPDPBUUDrr: + case X86::VPDPBUUDYrr: case X86::VPDPWSSDZ128r: case X86::VPDPWSSDZ128rk: case X86::VPDPWSSDZ128rkz: diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -921,6 +921,7 @@ def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">; def HasBF16 : Predicate<"Subtarget->hasBF16()">; def HasFP16 : Predicate<"Subtarget->hasFP16()">; +def HasAVXVNNIINT8 : Predicate<"Subtarget->hasAVXVNNIINT8()">; def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">; def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">; diff --git 
a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -8123,3 +8123,61 @@ X86GF2P8affineqb>, TAPD; } +let Constraints = "$src1 = $dst" in +multiclass avx_dotprod_rm Opc, string OpcodeStr, ValueType OpVT, + RegisterClass RC, PatFrag MemOpFrag, + X86MemOperand X86memop, SDNode OpNode, + X86FoldableSchedWrite Sched, + bit IsCommutable> { + let isCommutable = IsCommutable in + def rr : I, + VEX_4V, Sched<[Sched]>; + def rm : I, + VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>; +} + +let Predicates = [HasAVXVNNIINT8] in { + defm VPDPBSSD : avx_dotprod_rm<0x50,"vpdpbssd", v4i32, VR128, loadv4i32, + i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM, + 1>, T8XD; + defm VPDPBSSDY : avx_dotprod_rm<0x50,"vpdpbssd", v8i32, VR256, loadv8i32, + i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM, + 1>, VEX_L, T8XD; + defm VPDPBUUD : avx_dotprod_rm<0x50,"vpdpbuud", v4i32, VR128, loadv4i32, + i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM, + 1>, T8PS; + defm VPDPBUUDY : avx_dotprod_rm<0x50,"vpdpbuud", v8i32, VR256, loadv8i32, + i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM, + 1>, VEX_L, T8PS; + defm VPDPBSSDS : avx_dotprod_rm<0x51,"vpdpbssds", v4i32, VR128, loadv4i32, + i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM, + 1>, T8XD; + defm VPDPBSSDSY : avx_dotprod_rm<0x51,"vpdpbssds", v8i32, VR256, loadv8i32, + i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM, + 1>, VEX_L, T8XD; + defm VPDPBUUDS : avx_dotprod_rm<0x51,"vpdpbuuds", v4i32, VR128, loadv4i32, + i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM, + 1>, T8PS; + defm VPDPBUUDSY : avx_dotprod_rm<0x51,"vpdpbuuds", v8i32, VR256, loadv8i32, + i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM, + 1>, VEX_L, T8PS; + defm VPDPBSUD : avx_dotprod_rm<0x50,"vpdpbsud", v4i32, VR128, loadv4i32, + i128mem, X86vpdpbsud, SchedWriteVecIMul.XMM, + 0>, T8XS; + defm VPDPBSUDY : avx_dotprod_rm<0x50,"vpdpbsud", v8i32, VR256, loadv8i32, + i256mem, X86vpdpbsud, 
SchedWriteVecIMul.YMM, + 0>, VEX_L, T8XS; + defm VPDPBSUDS : avx_dotprod_rm<0x51,"vpdpbsuds", v4i32, VR128, loadv4i32, + i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM, + 0>, T8XS; + defm VPDPBSUDSY : avx_dotprod_rm<0x51,"vpdpbsuds", v8i32, VR256, loadv8i32, + i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM, + 0>, VEX_L, T8XS; +} diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -415,6 +415,18 @@ X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0), X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, X86ISD::VSRLV, 0), X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0), + X86_INTRINSIC_DATA(avx2_vpdpbssd_128, INTR_TYPE_3OP, X86ISD::VPDPBSSD, 0), + X86_INTRINSIC_DATA(avx2_vpdpbssd_256, INTR_TYPE_3OP, X86ISD::VPDPBSSD, 0), + X86_INTRINSIC_DATA(avx2_vpdpbssds_128, INTR_TYPE_3OP, X86ISD::VPDPBSSDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpbssds_256, INTR_TYPE_3OP, X86ISD::VPDPBSSDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpbsud_128, INTR_TYPE_3OP, X86ISD::VPDPBSUD, 0), + X86_INTRINSIC_DATA(avx2_vpdpbsud_256, INTR_TYPE_3OP, X86ISD::VPDPBSUD, 0), + X86_INTRINSIC_DATA(avx2_vpdpbsuds_128, INTR_TYPE_3OP, X86ISD::VPDPBSUDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpbsuds_256, INTR_TYPE_3OP, X86ISD::VPDPBSUDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpbuud_128, INTR_TYPE_3OP, X86ISD::VPDPBUUD, 0), + X86_INTRINSIC_DATA(avx2_vpdpbuud_256, INTR_TYPE_3OP, X86ISD::VPDPBUUD, 0), + X86_INTRINSIC_DATA(avx2_vpdpbuuds_128, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpbuuds_256, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0), X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_conflict_d_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), diff --git a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll 
b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll @@ -0,0 +1,316 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X64 + + +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbssd_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X86-NEXT: vpdpbssd (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x50,0x18] +; X86-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssd_128: +; X64: # %bb.0: +; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X64-NEXT: vpdpbssd (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x50,0x1f] +; X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <4 x i32>, <4 x i32>* %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> 
%x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbssds_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X86-NEXT: vpdpbssds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x51,0x18] +; X86-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssds_128: +; X64: # %bb.0: +; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X64-NEXT: vpdpbssds (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x51,0x1f] +; X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <4 x i32>, <4 x i32>* %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbssd_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X86-NEXT: vpdpbssd (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x50,0x18] +; X86-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2] +; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssd_256: +; X64: # %bb.0: +; X64-NEXT: vmovaps 
%ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X64-NEXT: vpdpbssd (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x50,0x1f] +; X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2] +; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <8 x i32>, <8 x i32>* %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbssds_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X86-NEXT: vpdpbssds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x51,0x18] +; X86-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2] +; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssds_256: +; X64: # %bb.0: +; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X64-NEXT: vpdpbssds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x51,0x1f] +; X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2] +; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <8 x i32>, <8 x i32>* %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} + +declare <4 x 
i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbsud_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X86-NEXT: vpdpbsud (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x50,0x18] +; X86-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsud_128: +; X64: # %bb.0: +; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X64-NEXT: vpdpbsud (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x50,0x1f] +; X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <4 x i32>, <4 x i32>* %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X86-NEXT: vpdpbsuds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x51,0x18] +; X86-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] 
+; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; X64: # %bb.0: +; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X64-NEXT: vpdpbsuds (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x51,0x1f] +; X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <4 x i32>, <4 x i32>* %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbsud_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X86-NEXT: vpdpbsud (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x50,0x18] +; X86-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2] +; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsud_256: +; X64: # %bb.0: +; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X64-NEXT: vpdpbsud (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x50,0x1f] +; X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2] +; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <8 x i32>, <8 x i32>* %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> 
@llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbsuds_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X86-NEXT: vpdpbsuds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x51,0x18] +; X86-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2] +; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsuds_256: +; X64: # %bb.0: +; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X64-NEXT: vpdpbsuds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x51,0x1f] +; X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2] +; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <8 x i32>, <8 x i32>* %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbuud_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X86-NEXT: vpdpbuud (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x50,0x18] +; 
X86-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbuud_128: +; X64: # %bb.0: +; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X64-NEXT: vpdpbuud (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x50,0x1f] +; X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <4 x i32>, <4 x i32>* %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbuuds_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X86-NEXT: vpdpbuuds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x51,0x18] +; X86-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbuuds_128: +; X64: # %bb.0: +; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X64-NEXT: vpdpbuuds (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x51,0x1f] +; X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <4 x i32>, 
<4 x i32>* %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbuud_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X86-NEXT: vpdpbuud (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x50,0x18] +; X86-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2] +; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbuud_256: +; X64: # %bb.0: +; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X64-NEXT: vpdpbuud (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x50,0x1f] +; X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2] +; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <8 x i32>, <8 x i32>* %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbuuds_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %ymm0, 
%ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X86-NEXT: vpdpbuuds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x51,0x18] +; X86-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2] +; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbuuds_256: +; X64: # %bb.0: +; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X64-NEXT: vpdpbuuds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x51,0x1f] +; X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2] +; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <8 x i32>, <8 x i32>* %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll @@ -0,0 +1,355 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 < %s | FileCheck %s + +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x 
i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <4 x i32> @stack_fold_vpdpbssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbssd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbssd_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + ret <4 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbssd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + ret <8 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbssd_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + ret <8 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbssds: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbssds_commuted: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + ret <4 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbssds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + ret <8 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbssds_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + ret <8 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbsud(<4 x i32> %a0, 
<4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbsud: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbsud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbsud_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; CHECK-NEXT: vpdpbsud %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + ret <4 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbsud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbsud_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> 
%a2) + ret <8 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbsud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbsud_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; CHECK-NEXT: vpdpbsud %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + ret <8 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbsuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbsuds: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbsuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbsuds_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; CHECK-NEXT: vpdpbsuds %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + ret <4 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbsuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbsuds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + ret <8 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbsuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbsuds_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; CHECK-NEXT: vpdpbsuds %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + ret <8 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbuud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbuud: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbuud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbuud_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + ret <4 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbuud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbuud_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + ret <8 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbuud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbuud_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; 
CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + ret <8 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbuuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbuuds: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbuuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbuuds_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + ret <4 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbuuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbuuds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + ret <8 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbuuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbuuds_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + ret <8 x i32> %2 +} diff --git a/llvm/test/MC/Disassembler/X86/avx-vnni-int8-32.txt b/llvm/test/MC/Disassembler/X86/avx-vnni-int8-32.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx-vnni-int8-32.txt @@ -0,0 +1,243 @@ +# RUN: llvm-mc --disassemble %s -triple=i686 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vpdpbssd %ymm4, %ymm3, %ymm2 +# INTEL: vpdpbssd ymm2, ymm3, ymm4 +0xc4,0xe2,0x67,0x50,0xd4 + +# ATT: vpdpbssd %xmm4, %xmm3, %xmm2 +# INTEL: vpdpbssd xmm2, xmm3, xmm4 +0xc4,0xe2,0x63,0x50,0xd4 + +# ATT: vpdpbssd 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] 
+0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssd 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x67,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssd (%eax), %ymm3, %ymm2 +# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0x67,0x50,0x10 + +# ATT: vpdpbssd -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbssd 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssd 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x63,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssd (%eax), %xmm3, %xmm2 +# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0x63,0x50,0x10 + +# ATT: vpdpbssd -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbssds %ymm4, %ymm3, %ymm2 +# INTEL: vpdpbssds ymm2, ymm3, ymm4 +0xc4,0xe2,0x67,0x51,0xd4 + +# ATT: vpdpbssds %xmm4, %xmm3, %xmm2 +# INTEL: vpdpbssds xmm2, xmm3, xmm4 +0xc4,0xe2,0x63,0x51,0xd4 + +# ATT: vpdpbssds 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssds 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x67,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssds (%eax), %ymm3, %ymm2 +# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0x67,0x51,0x10 + +# ATT: vpdpbssds -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbssds 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpbssds 
xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssds 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x63,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssds (%eax), %xmm3, %xmm2 +# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0x63,0x51,0x10 + +# ATT: vpdpbssds -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbsud %ymm4, %ymm3, %ymm2 +# INTEL: vpdpbsud ymm2, ymm3, ymm4 +0xc4,0xe2,0x66,0x50,0xd4 + +# ATT: vpdpbsud %xmm4, %xmm3, %xmm2 +# INTEL: vpdpbsud xmm2, xmm3, xmm4 +0xc4,0xe2,0x62,0x50,0xd4 + +# ATT: vpdpbsud 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsud 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x66,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsud (%eax), %ymm3, %ymm2 +# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0x66,0x50,0x10 + +# ATT: vpdpbsud -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbsud 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsud 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x62,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsud (%eax), %xmm3, %xmm2 +# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0x62,0x50,0x10 + +# ATT: vpdpbsud -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbsuds %ymm4, %ymm3, 
%ymm2 +# INTEL: vpdpbsuds ymm2, ymm3, ymm4 +0xc4,0xe2,0x66,0x51,0xd4 + +# ATT: vpdpbsuds %xmm4, %xmm3, %xmm2 +# INTEL: vpdpbsuds xmm2, xmm3, xmm4 +0xc4,0xe2,0x62,0x51,0xd4 + +# ATT: vpdpbsuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsuds 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x66,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsuds (%eax), %ymm3, %ymm2 +# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0x66,0x51,0x10 + +# ATT: vpdpbsuds -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbsuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsuds 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x62,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsuds (%eax), %xmm3, %xmm2 +# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0x62,0x51,0x10 + +# ATT: vpdpbsuds -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbuud %ymm4, %ymm3, %ymm2 +# INTEL: vpdpbuud ymm2, ymm3, ymm4 +0xc4,0xe2,0x64,0x50,0xd4 + +# ATT: vpdpbuud %xmm4, %xmm3, %xmm2 +# INTEL: vpdpbuud xmm2, xmm3, xmm4 +0xc4,0xe2,0x60,0x50,0xd4 + +# ATT: vpdpbuud 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuud 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x64,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuud (%eax), %ymm3, %ymm2 +# 
INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0x64,0x50,0x10 + +# ATT: vpdpbuud -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbuud 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuud 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x60,0x50,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuud (%eax), %xmm3, %xmm2 +# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0x60,0x50,0x10 + +# ATT: vpdpbuud -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbuuds %ymm4, %ymm3, %ymm2 +# INTEL: vpdpbuuds ymm2, ymm3, ymm4 +0xc4,0xe2,0x64,0x51,0xd4 + +# ATT: vpdpbuuds %xmm4, %xmm3, %xmm2 +# INTEL: vpdpbuuds xmm2, xmm3, xmm4 +0xc4,0xe2,0x60,0x51,0xd4 + +# ATT: vpdpbuuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuuds 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x64,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuuds (%eax), %ymm3, %ymm2 +# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0x64,0x51,0x10 + +# ATT: vpdpbuuds -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbuuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuuds 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] 
+0xc4,0xe2,0x60,0x51,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuuds (%eax), %xmm3, %xmm2 +# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0x60,0x51,0x10 + +# ATT: vpdpbuuds -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff + diff --git a/llvm/test/MC/Disassembler/X86/avx-vnni-int8-64.txt b/llvm/test/MC/Disassembler/X86/avx-vnni-int8-64.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx-vnni-int8-64.txt @@ -0,0 +1,243 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vpdpbssd %ymm14, %ymm13, %ymm12 +# INTEL: vpdpbssd ymm12, ymm13, ymm14 +0xc4,0x42,0x17,0x50,0xe6 + +# ATT: vpdpbssd %xmm14, %xmm13, %xmm12 +# INTEL: vpdpbssd xmm12, xmm13, xmm14 +0xc4,0x42,0x13,0x50,0xe6 + +# ATT: vpdpbssd 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpbssd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x17,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssd 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpbssd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x17,0x50,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssd (%rip), %ymm13, %ymm12 +# INTEL: vpdpbssd ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x17,0x50,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssd -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpbssd ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x17,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbssd 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpbssd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x13,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssd 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpbssd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x13,0x50,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssd 
(%rip), %xmm13, %xmm12 +# INTEL: vpdpbssd xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x13,0x50,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssd -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpbssd xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x13,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbssds %ymm14, %ymm13, %ymm12 +# INTEL: vpdpbssds ymm12, ymm13, ymm14 +0xc4,0x42,0x17,0x51,0xe6 + +# ATT: vpdpbssds %xmm14, %xmm13, %xmm12 +# INTEL: vpdpbssds xmm12, xmm13, xmm14 +0xc4,0x42,0x13,0x51,0xe6 + +# ATT: vpdpbssds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpbssds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x17,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssds 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpbssds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x17,0x51,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssds (%rip), %ymm13, %ymm12 +# INTEL: vpdpbssds ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x17,0x51,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssds -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpbssds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x17,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbssds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpbssds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x13,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssds 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpbssds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x13,0x51,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssds (%rip), %xmm13, %xmm12 +# INTEL: vpdpbssds xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x13,0x51,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssds -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpbssds xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x13,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbsud %ymm14, %ymm13, %ymm12 +# INTEL: vpdpbsud ymm12, ymm13, ymm14 +0xc4,0x42,0x16,0x50,0xe6 + +# ATT: vpdpbsud %xmm14, %xmm13, %xmm12 +# INTEL: vpdpbsud xmm12, xmm13, xmm14 
+0xc4,0x42,0x12,0x50,0xe6 + +# ATT: vpdpbsud 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpbsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x16,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsud 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpbsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x16,0x50,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsud (%rip), %ymm13, %ymm12 +# INTEL: vpdpbsud ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x16,0x50,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsud -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpbsud ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x16,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbsud 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpbsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x12,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsud 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpbsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x12,0x50,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsud (%rip), %xmm13, %xmm12 +# INTEL: vpdpbsud xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x12,0x50,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsud -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpbsud xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x12,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbsuds %ymm14, %ymm13, %ymm12 +# INTEL: vpdpbsuds ymm12, ymm13, ymm14 +0xc4,0x42,0x16,0x51,0xe6 + +# ATT: vpdpbsuds %xmm14, %xmm13, %xmm12 +# INTEL: vpdpbsuds xmm12, xmm13, xmm14 +0xc4,0x42,0x12,0x51,0xe6 + +# ATT: vpdpbsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpbsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x16,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsuds 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpbsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x16,0x51,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsuds (%rip), %ymm13, %ymm12 +# INTEL: vpdpbsuds ymm12, ymm13, ymmword ptr [rip] 
+0xc4,0x62,0x16,0x51,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsuds -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpbsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x16,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpbsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x12,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsuds 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpbsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x12,0x51,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsuds (%rip), %xmm13, %xmm12 +# INTEL: vpdpbsuds xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x12,0x51,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsuds -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpbsuds xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x12,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbuud %ymm14, %ymm13, %ymm12 +# INTEL: vpdpbuud ymm12, ymm13, ymm14 +0xc4,0x42,0x14,0x50,0xe6 + +# ATT: vpdpbuud %xmm14, %xmm13, %xmm12 +# INTEL: vpdpbuud xmm12, xmm13, xmm14 +0xc4,0x42,0x10,0x50,0xe6 + +# ATT: vpdpbuud 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpbuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x14,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuud 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpbuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x14,0x50,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuud (%rip), %ymm13, %ymm12 +# INTEL: vpdpbuud ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x14,0x50,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuud -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpbuud ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x14,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbuud 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpbuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x10,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuud 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpbuud xmm12, xmm13, xmmword ptr 
[r8 + 4*rax + 291] +0xc4,0x42,0x10,0x50,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuud (%rip), %xmm13, %xmm12 +# INTEL: vpdpbuud xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x10,0x50,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuud -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpbuud xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x10,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbuuds %ymm14, %ymm13, %ymm12 +# INTEL: vpdpbuuds ymm12, ymm13, ymm14 +0xc4,0x42,0x14,0x51,0xe6 + +# ATT: vpdpbuuds %xmm14, %xmm13, %xmm12 +# INTEL: vpdpbuuds xmm12, xmm13, xmm14 +0xc4,0x42,0x10,0x51,0xe6 + +# ATT: vpdpbuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpbuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x14,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuuds 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpbuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x14,0x51,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuuds (%rip), %ymm13, %ymm12 +# INTEL: vpdpbuuds ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x14,0x51,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuuds -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpbuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x14,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpbuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x10,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuuds 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpbuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x10,0x51,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuuds (%rip), %xmm13, %xmm12 +# INTEL: vpdpbuuds xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x10,0x51,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuuds -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpbuuds xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x10,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff + diff --git a/llvm/test/MC/X86/avx_vnni_int8-32-att.s b/llvm/test/MC/X86/avx_vnni_int8-32-att.s new file 
mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/avx_vnni_int8-32-att.s @@ -0,0 +1,241 @@ +// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnniint8 --show-encoding %s | FileCheck %s + +// CHECK: vpdpbssd %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0xd4] + vpdpbssd %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpbssd %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0xd4] + vpdpbssd %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpbssd 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssd 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpbssd 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssd 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpbssd (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x10] + vpdpbssd (%eax), %ymm3, %ymm2 + +// CHECK: vpdpbssd -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssd -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpbssd 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssd 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpbssd 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssd 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpbssd (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x10] + vpdpbssd (%eax), %xmm3, %xmm2 + +// CHECK: vpdpbssd -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssd -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpbssds %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0xd4] + vpdpbssds %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpbssds %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0xd4] + vpdpbssds %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpbssds 268435456(%esp,%esi,8), %ymm3, 
%ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssds 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpbssds 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssds 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpbssds (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x10] + vpdpbssds (%eax), %ymm3, %ymm2 + +// CHECK: vpdpbssds -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssds -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpbssds 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssds 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpbssds 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssds 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpbssds (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x10] + vpdpbssds (%eax), %xmm3, %xmm2 + +// CHECK: vpdpbssds -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssds -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpbsud %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0xd4] + vpdpbsud %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpbsud %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0xd4] + vpdpbsud %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpbsud 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsud 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpbsud 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsud 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpbsud (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x10] + vpdpbsud (%eax), %ymm3, %ymm2 + +// CHECK: vpdpbsud -1024(,%ebp,2), %ymm3, %ymm2 +// 
CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsud -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpbsud 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsud 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpbsud 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsud 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpbsud (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x10] + vpdpbsud (%eax), %xmm3, %xmm2 + +// CHECK: vpdpbsud -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsud -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpbsuds %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0xd4] + vpdpbsuds %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpbsuds %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0xd4] + vpdpbsuds %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpbsuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsuds 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpbsuds 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsuds 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpbsuds (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x10] + vpdpbsuds (%eax), %ymm3, %ymm2 + +// CHECK: vpdpbsuds -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsuds -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpbsuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsuds 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpbsuds 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsuds 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: 
vpdpbsuds (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x10] + vpdpbsuds (%eax), %xmm3, %xmm2 + +// CHECK: vpdpbsuds -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsuds -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpbuud %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0xd4] + vpdpbuud %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpbuud %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0xd4] + vpdpbuud %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpbuud 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuud 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpbuud 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuud 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpbuud (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x10] + vpdpbuud (%eax), %ymm3, %ymm2 + +// CHECK: vpdpbuud -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuud -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpbuud 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuud 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpbuud 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuud 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpbuud (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x10] + vpdpbuud (%eax), %xmm3, %xmm2 + +// CHECK: vpdpbuud -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuud -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpbuuds %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0xd4] + vpdpbuuds %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpbuuds %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0xd4] 
+ vpdpbuuds %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpbuuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuuds 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpbuuds 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuuds 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpbuuds (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x10] + vpdpbuuds (%eax), %ymm3, %ymm2 + +// CHECK: vpdpbuuds -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuuds -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpbuuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuuds 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpbuuds 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuuds 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpbuuds (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x10] + vpdpbuuds (%eax), %xmm3, %xmm2 + +// CHECK: vpdpbuuds -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuuds -512(,%ebp,2), %xmm3, %xmm2 diff --git a/llvm/test/MC/X86/avx_vnni_int8-32-intel.s b/llvm/test/MC/X86/avx_vnni_int8-32-intel.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/avx_vnni_int8-32-intel.s @@ -0,0 +1,242 @@ +// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnniint8 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vpdpbssd ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0xd4] + vpdpbssd ymm2, ymm3, ymm4 + +// CHECK: vpdpbssd xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0xd4] + vpdpbssd xmm2, xmm3, xmm4 + +// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: 
[0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x10] + vpdpbssd ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x10] + vpdpbssd xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpbssds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0xd4] + vpdpbssds ymm2, ymm3, ymm4 + +// CHECK: vpdpbssds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0xd4] + vpdpbssds xmm2, xmm3, xmm4 + +// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] + +// 
CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x10] + vpdpbssds ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x10] + vpdpbssds xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpbsud ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0xd4] + vpdpbsud ymm2, ymm3, ymm4 + +// CHECK: vpdpbsud xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0xd4] + vpdpbsud xmm2, xmm3, xmm4 + +// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x10] + vpdpbsud ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsud ymm2, ymm3, ymmword ptr 
[2*ebp - 1024] + +// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x10] + vpdpbsud xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpbsuds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0xd4] + vpdpbsuds ymm2, ymm3, ymm4 + +// CHECK: vpdpbsuds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0xd4] + vpdpbsuds xmm2, xmm3, xmm4 + +// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x10] + vpdpbsuds ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: 
encoding: [0xc4,0xe2,0x62,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x10] + vpdpbsuds xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpbuud ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0xd4] + vpdpbuud ymm2, ymm3, ymm4 + +// CHECK: vpdpbuud xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0xd4] + vpdpbuud xmm2, xmm3, xmm4 + +// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x10] + vpdpbuud ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x10] + vpdpbuud xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 
512] +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpbuuds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0xd4] + vpdpbuuds ymm2, ymm3, ymm4 + +// CHECK: vpdpbuuds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0xd4] + vpdpbuuds xmm2, xmm3, xmm4 + +// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x10] + vpdpbuuds ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x10] + vpdpbuuds xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512] + diff --git a/llvm/test/MC/X86/avx_vnni_int8-64-att.s b/llvm/test/MC/X86/avx_vnni_int8-64-att.s new file mode 100644 --- /dev/null +++ 
b/llvm/test/MC/X86/avx_vnni_int8-64-att.s @@ -0,0 +1,242 @@ +// RUN: llvm-mc -triple=x86_64-unknown-unknown -mattr=+avxvnniint8 --show-encoding < %s | FileCheck %s + +// CHECK: vpdpbssd %ymm14, %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xe6] + vpdpbssd %ymm14, %ymm13, %ymm12 + +// CHECK: vpdpbssd %xmm14, %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xe6] + vpdpbssd %xmm14, %xmm13, %xmm12 + +// CHECK: vpdpbssd 268435456(%rbp,%r14,8), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x22,0x17,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssd 268435456(%rbp,%r14,8), %ymm13, %ymm12 + +// CHECK: vpdpbssd 291(%r8,%rax,4), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbssd 291(%r8,%rax,4), %ymm13, %ymm12 + +// CHECK: vpdpbssd (%rip), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x25,0x00,0x00,0x00,0x00] + vpdpbssd (%rip), %ymm13, %ymm12 + +// CHECK: vpdpbssd -1024(,%rbp,2), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssd -1024(,%rbp,2), %ymm13, %ymm12 + +// CHECK: vpdpbssd 268435456(%rbp,%r14,8), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x22,0x13,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssd 268435456(%rbp,%r14,8), %xmm13, %xmm12 + +// CHECK: vpdpbssd 291(%r8,%rax,4), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbssd 291(%r8,%rax,4), %xmm13, %xmm12 + +// CHECK: vpdpbssd (%rip), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x25,0x00,0x00,0x00,0x00] + vpdpbssd (%rip), %xmm13, %xmm12 + +// CHECK: vpdpbssd -512(,%rbp,2), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssd -512(,%rbp,2), %xmm13, %xmm12 + +// CHECK: vpdpbssds %ymm14, %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xe6] + vpdpbssds %ymm14, %ymm13, %ymm12 + +// CHECK: vpdpbssds %xmm14, %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xe6] + vpdpbssds %xmm14, 
%xmm13, %xmm12 + +// CHECK: vpdpbssds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x22,0x17,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssds 268435456(%rbp,%r14,8), %ymm13, %ymm12 + +// CHECK: vpdpbssds 291(%r8,%rax,4), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbssds 291(%r8,%rax,4), %ymm13, %ymm12 + +// CHECK: vpdpbssds (%rip), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x25,0x00,0x00,0x00,0x00] + vpdpbssds (%rip), %ymm13, %ymm12 + +// CHECK: vpdpbssds -1024(,%rbp,2), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssds -1024(,%rbp,2), %ymm13, %ymm12 + +// CHECK: vpdpbssds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x22,0x13,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssds 268435456(%rbp,%r14,8), %xmm13, %xmm12 + +// CHECK: vpdpbssds 291(%r8,%rax,4), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbssds 291(%r8,%rax,4), %xmm13, %xmm12 + +// CHECK: vpdpbssds (%rip), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x25,0x00,0x00,0x00,0x00] + vpdpbssds (%rip), %xmm13, %xmm12 + +// CHECK: vpdpbssds -512(,%rbp,2), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssds -512(,%rbp,2), %xmm13, %xmm12 + +// CHECK: vpdpbsud %ymm14, %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xe6] + vpdpbsud %ymm14, %ymm13, %ymm12 + +// CHECK: vpdpbsud %xmm14, %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xe6] + vpdpbsud %xmm14, %xmm13, %xmm12 + +// CHECK: vpdpbsud 268435456(%rbp,%r14,8), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x22,0x16,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsud 268435456(%rbp,%r14,8), %ymm13, %ymm12 + +// CHECK: vpdpbsud 291(%r8,%rax,4), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbsud 291(%r8,%rax,4), %ymm13, %ymm12 + +// CHECK: vpdpbsud 
(%rip), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x25,0x00,0x00,0x00,0x00] + vpdpbsud (%rip), %ymm13, %ymm12 + +// CHECK: vpdpbsud -1024(,%rbp,2), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsud -1024(,%rbp,2), %ymm13, %ymm12 + +// CHECK: vpdpbsud 268435456(%rbp,%r14,8), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x22,0x12,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsud 268435456(%rbp,%r14,8), %xmm13, %xmm12 + +// CHECK: vpdpbsud 291(%r8,%rax,4), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbsud 291(%r8,%rax,4), %xmm13, %xmm12 + +// CHECK: vpdpbsud (%rip), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x25,0x00,0x00,0x00,0x00] + vpdpbsud (%rip), %xmm13, %xmm12 + +// CHECK: vpdpbsud -512(,%rbp,2), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsud -512(,%rbp,2), %xmm13, %xmm12 + +// CHECK: vpdpbsuds %ymm14, %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xe6] + vpdpbsuds %ymm14, %ymm13, %ymm12 + +// CHECK: vpdpbsuds %xmm14, %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xe6] + vpdpbsuds %xmm14, %xmm13, %xmm12 + +// CHECK: vpdpbsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x22,0x16,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 + +// CHECK: vpdpbsuds 291(%r8,%rax,4), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbsuds 291(%r8,%rax,4), %ymm13, %ymm12 + +// CHECK: vpdpbsuds (%rip), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x25,0x00,0x00,0x00,0x00] + vpdpbsuds (%rip), %ymm13, %ymm12 + +// CHECK: vpdpbsuds -1024(,%rbp,2), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsuds -1024(,%rbp,2), %ymm13, %ymm12 + +// CHECK: vpdpbsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +// CHECK: encoding: 
[0xc4,0x22,0x12,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 + +// CHECK: vpdpbsuds 291(%r8,%rax,4), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbsuds 291(%r8,%rax,4), %xmm13, %xmm12 + +// CHECK: vpdpbsuds (%rip), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x25,0x00,0x00,0x00,0x00] + vpdpbsuds (%rip), %xmm13, %xmm12 + +// CHECK: vpdpbsuds -512(,%rbp,2), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsuds -512(,%rbp,2), %xmm13, %xmm12 + +// CHECK: vpdpbuud %ymm14, %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xe6] + vpdpbuud %ymm14, %ymm13, %ymm12 + +// CHECK: vpdpbuud %xmm14, %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xe6] + vpdpbuud %xmm14, %xmm13, %xmm12 + +// CHECK: vpdpbuud 268435456(%rbp,%r14,8), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x22,0x14,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuud 268435456(%rbp,%r14,8), %ymm13, %ymm12 + +// CHECK: vpdpbuud 291(%r8,%rax,4), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbuud 291(%r8,%rax,4), %ymm13, %ymm12 + +// CHECK: vpdpbuud (%rip), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x25,0x00,0x00,0x00,0x00] + vpdpbuud (%rip), %ymm13, %ymm12 + +// CHECK: vpdpbuud -1024(,%rbp,2), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuud -1024(,%rbp,2), %ymm13, %ymm12 + +// CHECK: vpdpbuud 268435456(%rbp,%r14,8), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x22,0x10,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuud 268435456(%rbp,%r14,8), %xmm13, %xmm12 + +// CHECK: vpdpbuud 291(%r8,%rax,4), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbuud 291(%r8,%rax,4), %xmm13, %xmm12 + +// CHECK: vpdpbuud (%rip), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x25,0x00,0x00,0x00,0x00] + vpdpbuud 
(%rip), %xmm13, %xmm12 + +// CHECK: vpdpbuud -512(,%rbp,2), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuud -512(,%rbp,2), %xmm13, %xmm12 + +// CHECK: vpdpbuuds %ymm14, %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xe6] + vpdpbuuds %ymm14, %ymm13, %ymm12 + +// CHECK: vpdpbuuds %xmm14, %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xe6] + vpdpbuuds %xmm14, %xmm13, %xmm12 + +// CHECK: vpdpbuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x22,0x14,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 + +// CHECK: vpdpbuuds 291(%r8,%rax,4), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbuuds 291(%r8,%rax,4), %ymm13, %ymm12 + +// CHECK: vpdpbuuds (%rip), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x25,0x00,0x00,0x00,0x00] + vpdpbuuds (%rip), %ymm13, %ymm12 + +// CHECK: vpdpbuuds -1024(,%rbp,2), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuuds -1024(,%rbp,2), %ymm13, %ymm12 + +// CHECK: vpdpbuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x22,0x10,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 + +// CHECK: vpdpbuuds 291(%r8,%rax,4), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbuuds 291(%r8,%rax,4), %xmm13, %xmm12 + +// CHECK: vpdpbuuds (%rip), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x25,0x00,0x00,0x00,0x00] + vpdpbuuds (%rip), %xmm13, %xmm12 + +// CHECK: vpdpbuuds -512(,%rbp,2), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuuds -512(,%rbp,2), %xmm13, %xmm12 + diff --git a/llvm/test/MC/X86/avx_vnni_int8-64-intel.s b/llvm/test/MC/X86/avx_vnni_int8-64-intel.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/avx_vnni_int8-64-intel.s @@ -0,0 +1,242 
@@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnniint8 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vpdpbssd ymm12, ymm13, ymm14 +// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xe6] + vpdpbssd ymm12, ymm13, ymm14 + +// CHECK: vpdpbssd xmm12, xmm13, xmm14 +// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xe6] + vpdpbssd xmm12, xmm13, xmm14 + +// CHECK: vpdpbssd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x17,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpbssd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbssd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpbssd ymm12, ymm13, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x25,0x00,0x00,0x00,0x00] + vpdpbssd ymm12, ymm13, ymmword ptr [rip] + +// CHECK: vpdpbssd ymm12, ymm13, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssd ymm12, ymm13, ymmword ptr [2*rbp - 1024] + +// CHECK: vpdpbssd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x13,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpbssd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbssd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpbssd xmm12, xmm13, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x25,0x00,0x00,0x00,0x00] + vpdpbssd xmm12, xmm13, xmmword ptr [rip] + +// CHECK: vpdpbssd xmm12, xmm13, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssd xmm12, xmm13, xmmword ptr [2*rbp - 512] + +// CHECK: vpdpbssds ymm12, ymm13, ymm14 +// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xe6] + 
vpdpbssds ymm12, ymm13, ymm14 + +// CHECK: vpdpbssds xmm12, xmm13, xmm14 +// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xe6] + vpdpbssds xmm12, xmm13, xmm14 + +// CHECK: vpdpbssds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x17,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpbssds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbssds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpbssds ymm12, ymm13, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x25,0x00,0x00,0x00,0x00] + vpdpbssds ymm12, ymm13, ymmword ptr [rip] + +// CHECK: vpdpbssds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssds ymm12, ymm13, ymmword ptr [2*rbp - 1024] + +// CHECK: vpdpbssds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x13,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbssds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpbssds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbssds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpbssds xmm12, xmm13, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x25,0x00,0x00,0x00,0x00] + vpdpbssds xmm12, xmm13, xmmword ptr [rip] + +// CHECK: vpdpbssds xmm12, xmm13, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssds xmm12, xmm13, xmmword ptr [2*rbp - 512] + +// CHECK: vpdpbsud ymm12, ymm13, ymm14 +// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xe6] + vpdpbsud ymm12, ymm13, ymm14 + +// CHECK: vpdpbsud xmm12, xmm13, xmm14 +// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xe6] + vpdpbsud xmm12, xmm13, xmm14 + +// CHECK: vpdpbsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 
268435456] +// CHECK: encoding: [0xc4,0x22,0x16,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpbsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpbsud ymm12, ymm13, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x25,0x00,0x00,0x00,0x00] + vpdpbsud ymm12, ymm13, ymmword ptr [rip] + +// CHECK: vpdpbsud ymm12, ymm13, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsud ymm12, ymm13, ymmword ptr [2*rbp - 1024] + +// CHECK: vpdpbsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x12,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpbsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpbsud xmm12, xmm13, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x25,0x00,0x00,0x00,0x00] + vpdpbsud xmm12, xmm13, xmmword ptr [rip] + +// CHECK: vpdpbsud xmm12, xmm13, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsud xmm12, xmm13, xmmword ptr [2*rbp - 512] + +// CHECK: vpdpbsuds ymm12, ymm13, ymm14 +// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xe6] + vpdpbsuds ymm12, ymm13, ymm14 + +// CHECK: vpdpbsuds xmm12, xmm13, xmm14 +// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xe6] + vpdpbsuds xmm12, xmm13, xmm14 + +// CHECK: vpdpbsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x16,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpbsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: 
[0xc4,0x42,0x16,0x51,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpbsuds ymm12, ymm13, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x25,0x00,0x00,0x00,0x00] + vpdpbsuds ymm12, ymm13, ymmword ptr [rip] + +// CHECK: vpdpbsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] + +// CHECK: vpdpbsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x12,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpbsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpbsuds xmm12, xmm13, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x25,0x00,0x00,0x00,0x00] + vpdpbsuds xmm12, xmm13, xmmword ptr [rip] + +// CHECK: vpdpbsuds xmm12, xmm13, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsuds xmm12, xmm13, xmmword ptr [2*rbp - 512] + +// CHECK: vpdpbuud ymm12, ymm13, ymm14 +// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xe6] + vpdpbuud ymm12, ymm13, ymm14 + +// CHECK: vpdpbuud xmm12, xmm13, xmm14 +// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xe6] + vpdpbuud xmm12, xmm13, xmm14 + +// CHECK: vpdpbuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x14,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpbuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpbuud ymm12, ymm13, ymmword ptr [rip] +// CHECK: encoding: 
[0xc4,0x62,0x14,0x50,0x25,0x00,0x00,0x00,0x00] + vpdpbuud ymm12, ymm13, ymmword ptr [rip] + +// CHECK: vpdpbuud ymm12, ymm13, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuud ymm12, ymm13, ymmword ptr [2*rbp - 1024] + +// CHECK: vpdpbuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x10,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpbuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpbuud xmm12, xmm13, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x25,0x00,0x00,0x00,0x00] + vpdpbuud xmm12, xmm13, xmmword ptr [rip] + +// CHECK: vpdpbuud xmm12, xmm13, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuud xmm12, xmm13, xmmword ptr [2*rbp - 512] + +// CHECK: vpdpbuuds ymm12, ymm13, ymm14 +// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xe6] + vpdpbuuds ymm12, ymm13, ymm14 + +// CHECK: vpdpbuuds xmm12, xmm13, xmm14 +// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xe6] + vpdpbuuds xmm12, xmm13, xmm14 + +// CHECK: vpdpbuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x14,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpbuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpbuuds ymm12, ymm13, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x25,0x00,0x00,0x00,0x00] + vpdpbuuds ymm12, ymm13, ymmword ptr [rip] + +// CHECK: vpdpbuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff] + 
vpdpbuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] + +// CHECK: vpdpbuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x10,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpbuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpbuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpbuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpbuuds xmm12, xmm13, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x25,0x00,0x00,0x00,0x00] + vpdpbuuds xmm12, xmm13, xmmword ptr [rip] + +// CHECK: vpdpbuuds xmm12, xmm13, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuuds xmm12, xmm13, xmmword ptr [2*rbp - 512] +