diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -700,6 +700,10 @@ - Add ISA of ``AMX-COMPLEX`` which supports ``tcmmimfp16ps`` and ``tcmmrlfp16ps``. +- Support ISA of ``AVX-VNNI-INT16``. + * Support intrinsic of ``_mm(256)_dpwsud(s)_epi32``. + * Support intrinsic of ``_mm(256)_dpwusd(s)_epi32``. + * Support intrinsic of ``_mm(256)_dpwuud(s)_epi32``. Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -2116,6 +2116,20 @@ TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "") +// AVX-VNNI-INT16 +TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16") +TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16") +TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16") +TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16") +TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16") +TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16") +TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16") +TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16") +TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16") +TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16") +TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16") +TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16") + // AVX-NE-CONVERT 
TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps128, "V4fyC*", "nV:128:", "avxneconvert") TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps256, "V8fyC*", "nV:256:", "avxneconvert") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4947,6 +4947,8 @@ def mno_avxifma : Flag<["-"], "mno-avxifma">, Group; def mavxneconvert : Flag<["-"], "mavxneconvert">, Group; def mno_avxneconvert : Flag<["-"], "mno-avxneconvert">, Group; +def mavxvnniint16 : Flag<["-"], "mavxvnniint16">, Group; +def mno_avxvnniint16 : Flag<["-"], "mno-avxvnniint16">, Group; def mavxvnniint8 : Flag<["-"], "mavxvnniint8">, Group; def mno_avxvnniint8 : Flag<["-"], "mno-avxvnniint8">, Group; def mavxvnni : Flag<["-"], "mavxvnni">, Group; diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -142,6 +142,7 @@ bool HasPTWRITE = false; bool HasINVPCID = false; bool HasENQCMD = false; + bool HasAVXVNNIINT16 = false; bool HasAMXFP16 = false; bool HasCMPCCXADD = false; bool HasRAOINT = false; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -347,6 +347,8 @@ HasAVXNECONVERT= true; } else if (Feature == "+avxvnni") { HasAVXVNNI = true; + } else if (Feature == "+avxvnniint16") { + HasAVXVNNIINT16 = true; } else if (Feature == "+avxvnniint8") { HasAVXVNNIINT8 = true; } else if (Feature == "+serialize") { @@ -824,6 +826,8 @@ Builder.defineMacro("__AVXNECONVERT__"); if (HasAVXVNNI) Builder.defineMacro("__AVXVNNI__"); + if (HasAVXVNNIINT16) + Builder.defineMacro("__AVXVNNIINT16__"); if (HasAVXVNNIINT8) Builder.defineMacro("__AVXVNNIINT8__"); if (HasSERIALIZE) @@ -952,6 +956,7 @@ .Case("avxifma", true) .Case("avxneconvert", true) .Case("avxvnni", true) + .Case("avxvnniint16", true) 
.Case("avxvnniint8", true) .Case("bmi", true) .Case("bmi2", true) @@ -1054,6 +1059,7 @@ .Case("avxifma", HasAVXIFMA) .Case("avxneconvert", HasAVXNECONVERT) .Case("avxvnni", HasAVXVNNI) + .Case("avxvnniint16", HasAVXVNNIINT16) .Case("avxvnniint8", HasAVXVNNIINT8) .Case("bmi", HasBMI) .Case("bmi2", HasBMI2) diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -160,6 +160,7 @@ avxifmaintrin.h avxintrin.h avxneconvertintrin.h + avxvnniint16intrin.h avxvnniint8intrin.h avxvnniintrin.h bmi2intrin.h diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/avxvnniint16intrin.h @@ -0,0 +1,473 @@ +/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error \ + "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead." +#endif // __IMMINTRIN_H + +#ifndef __AVXVNNIINT16INTRIN_H +#define __AVXVNNIINT16INTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \ + __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \ + __min_vector_width__(256))) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate +/// signed 32-bit results.
Sum these 2 results with the corresponding +/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPWSUD instruction. +/// +/// \param __W +/// A 128-bit vector of [4 x int]. +/// \param __A +/// A 128-bit vector of [8 x short]. +/// \param __B +/// A 128-bit vector of [8 x unsigned short]. +/// \returns +/// A 128-bit vector of [4 x int]. +/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W, + __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A, + (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate +/// signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPWSUD instruction. +/// +/// \param __W +/// A 256-bit vector of [8 x int]. +/// \param __A +/// A 256-bit vector of [16 x short]. +/// \param __B +/// A 256-bit vector of [16 x unsigned short]. +/// \returns +/// A 256-bit vector of [8 x int].
+/// +/// \code{.operation} +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A, + (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate +/// signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in \a __W with signed saturation, and store the packed +/// 32-bit results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// +/// \param __W +/// A 128-bit vector of [4 x int]. +/// \param __A +/// A 128-bit vector of [8 x short]. +/// \param __B +/// A 128-bit vector of [8 x unsigned short]. +/// \returns +/// A 128-bit vector of [4 x int]. +/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W, + __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A, + (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate +/// signed 32-bit results.
Sum these 2 results with the corresponding +/// 32-bit integer in \a __W with signed saturation, and store the packed +/// 32-bit results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// +/// \param __W +/// A 256-bit vector of [8 x int]. +/// \param __A +/// A 256-bit vector of [16 x short]. +/// \param __B +/// A 256-bit vector of [16 x unsigned short]. +/// \returns +/// A 256-bit vector of [8 x int]. +/// +/// \code{.operation} +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A, + (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with +/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate +/// signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPWUSD instruction. +/// +/// \param __W +/// A 128-bit vector of [4 x int]. +/// \param __A +/// A 128-bit vector of [8 x unsigned short]. +/// \param __B +/// A 128-bit vector of [8 x short]. +/// \returns +/// A 128-bit vector of [4 x int].
+/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W, + __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A, + (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with +/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate +/// signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPWUSD instruction. +/// +/// \param __W +/// A 256-bit vector of [8 x int]. +/// \param __A +/// A 256-bit vector of [16 x unsigned short]. +/// \param __B +/// A 256-bit vector of [16 x short]. +/// \returns +/// A 256-bit vector of [8 x int]. +/// +/// \code{.operation} +/// FOR j := 0 to 7 +/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A, + (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with +/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate +/// signed 32-bit results.
Sum these 2 results with the corresponding +/// 32-bit integer in \a __W with signed saturation, and store the packed +/// 32-bit results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPWUSDS instruction. +/// +/// \param __W +/// A 128-bit vector of [4 x int]. +/// \param __A +/// A 128-bit vector of [8 x unsigned short]. +/// \param __B +/// A 128-bit vector of [8 x short]. +/// \returns +/// A 128-bit vector of [4 x int]. +/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W, + __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A, + (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with +/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate +/// signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in \a __W with signed saturation, and store the packed +/// 32-bit results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPWUSDS instruction. +/// +/// \param __W +/// A 256-bit vector of [8 x int]. +/// \param __A +/// A 256-bit vector of [16 x unsigned short]. +/// \param __B +/// A 256-bit vector of [16 x short]. +/// \returns +/// A 256-bit vector of [8 x int].
+/// +/// \code{.operation} +/// FOR j := 0 to 7 +/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A, + (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with +/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate +/// unsigned 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPWUUD instruction. +/// +/// \param __W +/// A 128-bit vector of [4 x unsigned int]. +/// \param __A +/// A 128-bit vector of [8 x unsigned short]. +/// \param __B +/// A 128-bit vector of [8 x unsigned short]. +/// \returns +/// A 128-bit vector of [4 x unsigned int]. +/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) +/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W, + __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A, + (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with +/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate +/// unsigned 32-bit results.
Sum these 2 results with the corresponding +/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPWUUD instruction. +/// +/// \param __W +/// A 256-bit vector of [8 x unsigned int]. +/// \param __A +/// A 256-bit vector of [16 x unsigned short]. +/// \param __B +/// A 256-bit vector of [16 x unsigned short]. +/// \returns +/// A 256-bit vector of [8 x unsigned int]. +/// +/// \code{.operation} +/// FOR j := 0 to 7 +/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) +/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A, + (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with +/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate +/// unsigned 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in \a __W with unsigned saturation, and store the packed +/// 32-bit results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPWUUDS instruction. +/// +/// \param __W +/// A 128-bit vector of [4 x unsigned int]. +/// \param __A +/// A 128-bit vector of [8 x unsigned short]. +/// \param __B +/// A 128-bit vector of [8 x unsigned short]. +/// \returns +/// A 128-bit vector of [4 x unsigned int].
+/// +/// \code{.operation} +/// FOR j := 0 to 3 +/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) +/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// dst[MAX:128] := 0 +/// \endcode +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W, + __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A, + (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with +/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate +/// unsigned 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in \a __W with unsigned saturation, and store the packed +/// 32-bit results in \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B); +/// \endcode +/// +/// This intrinsic corresponds to the \c VPDPWUUDS instruction. +/// +/// \param __W +/// A 256-bit vector of [8 x unsigned int]. +/// \param __A +/// A 256-bit vector of [16 x unsigned short]. +/// \param __B +/// A 256-bit vector of [16 x unsigned short]. +/// \returns +/// A 256-bit vector of [8 x unsigned int].
+/// +/// \code{.operation} +/// FOR j := 0 to 7 +/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) +/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// dst[MAX:256] := 0 +/// \endcode +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A, + (__v8si)__B); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif // __AVXVNNIINT16INTRIN_H diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -269,6 +269,11 @@ #include #endif +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVXVNNIINT16__) +#include <avxvnniint16intrin.h> +#endif + #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ defined(__RDPID__) /// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
diff --git a/clang/test/CodeGen/X86/avxvnniint16-builtins.c b/clang/test/CodeGen/X86/avxvnniint16-builtins.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/X86/avxvnniint16-builtins.c @@ -0,0 +1,76 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avxvnniint16 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +avxvnniint16 -emit-llvm -o - -Wall -Werror | FileCheck %s + +#include <immintrin.h> + +__m128i test_mm_dpwsud_epi32(__m128i __A, __m128i __B, __m128i __C) { + // CHECK-LABEL: @test_mm_dpwsud_epi32( + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + return _mm_dpwsud_epi32(__A, __B, __C); +} + +__m256i test_mm256_dpwsud_epi32(__m256i __A, __m256i __B, __m256i __C) { + // CHECK-LABEL: @test_mm256_dpwsud_epi32( + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + return _mm256_dpwsud_epi32(__A, __B, __C); +} + +__m128i test_mm_dpwsuds_epi32(__m128i __A, __m128i __B, __m128i __C) { + // CHECK-LABEL: @test_mm_dpwsuds_epi32( + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + return _mm_dpwsuds_epi32(__A, __B, __C); +} + +__m256i test_mm256_dpwsuds_epi32(__m256i __A, __m256i __B, __m256i __C) { + // CHECK-LABEL: @test_mm256_dpwsuds_epi32( + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + return _mm256_dpwsuds_epi32(__A, __B, __C); +} + +__m128i test_mm_dpwusd_epi32(__m128i __A, __m128i __B, __m128i __C) { + // CHECK-LABEL: @test_mm_dpwusd_epi32( + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + return _mm_dpwusd_epi32(__A, __B, __C); +} + +__m256i test_mm256_dpwusd_epi32(__m256i __A, __m256i __B, __m256i __C) { + // CHECK-LABEL:
@test_mm256_dpwusd_epi32( + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + return _mm256_dpwusd_epi32(__A, __B, __C); +} + +__m128i test_mm_dpwusds_epi32(__m128i __A, __m128i __B, __m128i __C) { + // CHECK-LABEL: @test_mm_dpwusds_epi32( + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + return _mm_dpwusds_epi32(__A, __B, __C); +} + +__m256i test_mm256_dpwusds_epi32(__m256i __A, __m256i __B, __m256i __C) { + // CHECK-LABEL: @test_mm256_dpwusds_epi32( + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + return _mm256_dpwusds_epi32(__A, __B, __C); +} + +__m128i test_mm_dpwuud_epi32(__m128i __A, __m128i __B, __m128i __C) { + // CHECK-LABEL: @test_mm_dpwuud_epi32( + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + return _mm_dpwuud_epi32(__A, __B, __C); +} + +__m256i test_mm256_dpwuud_epi32(__m256i __A, __m256i __B, __m256i __C) { + // CHECK-LABEL: @test_mm256_dpwuud_epi32( + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + return _mm256_dpwuud_epi32(__A, __B, __C); +} + +__m128i test_mm_dpwuuds_epi32(__m128i __A, __m128i __B, __m128i __C) { + // CHECK-LABEL: @test_mm_dpwuuds_epi32( + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + return _mm_dpwuuds_epi32(__A, __B, __C); +} + +__m256i test_mm256_dpwuuds_epi32(__m256i __A, __m256i __B, __m256i __C) { + // CHECK-LABEL: @test_mm256_dpwuuds_epi32( + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + return _mm256_dpwuuds_epi32(__A, __B, __C); +} diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c --- a/clang/test/CodeGen/attr-target-x86.c +++ 
b/clang/test/CodeGen/attr-target-x86.c @@ -54,9 +54,9 @@ // CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="i686" // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" // CHECK-NOT: tune-cpu -// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" +// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686" -// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" +// CHECK: #4 = {{.*}}"target-cpu"="i686" 
"target-features"="+cmov,+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes" // CHECK-NOT: tune-cpu // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-3dnow,-3dnowa,-mmx" diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -349,6 +349,11 @@ // AVXNECONVERT: "-target-feature" "+avxneconvert" // NO-AVXNECONVERT: "-target-feature" "-avxneconvert" +// RUN: %clang --target=i386 -mavxvnniint16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVXVNNIINT16 %s +// RUN: %clang --target=i386 -mno-avxvnniint16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVXVNNIINT16 %s +// AVXVNNIINT16: "-target-feature" "+avxvnniint16" +// NO-AVXVNNIINT16: "-target-feature" "-avxvnniint16" + // RUN: %clang --target=i386 -march=i386 -mcrc32 %s -### 2>&1 | FileCheck -check-prefix=CRC32 %s // RUN: %clang --target=i386 -march=i386 -mno-crc32 %s -### 2>&1 | FileCheck -check-prefix=NO-CRC32 %s // CRC32: "-target-feature" "+crc32" diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -660,6 +660,20 @@ // AVXNECONVERTNOAVX2-NOT: #define __AVX2__ 1 // AVXNECONVERTNOAVX2-NOT: #define __AVXNECONVERT__ 1 +// RUN: %clang -target i686-unknown-linux-gnu -march=atom 
-mavxvnniint16 -x c -E -dM -o - %s | FileCheck -check-prefix=AVXVNNIINT16 %s + +// AVXVNNIINT16: #define __AVX2__ 1 +// AVXVNNIINT16: #define __AVXVNNIINT16__ 1 + +// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mno-avxvnniint16 -x c -E -dM -o - %s | FileCheck -check-prefix=NOAVXVNNIINT16 %s + +// NOAVXVNNIINT16-NOT: #define __AVXVNNIINT16__ 1 + +// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavxvnniint16 -mno-avx2 -x c -E -dM -o - %s | FileCheck -check-prefix=AVXVNNIINT16NOAVX2 %s + +// AVXVNNIINT16NOAVX2-NOT: #define __AVX2__ 1 +// AVXVNNIINT16NOAVX2-NOT: #define __AVXVNNIINT16__ 1 + // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mcrc32 -x c -E -dM -o - %s | FileCheck -check-prefix=CRC32 %s // CRC32: #define __CRC32__ 1 diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -276,7 +276,7 @@ * ``__builtin_unpredictable`` (unpredictable metadata in LLVM IR), is handled by X86 Backend. ``X86CmovConversion`` pass now respects this builtin and does not convert CMOVs to branches. - +* Support ISA of ``AVX-VNNI-INT16``. 
Changes to the OCaml bindings ----------------------------- diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -2053,6 +2053,67 @@ DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; + + def int_x86_avx2_vpdpwsud_128 + : ClangBuiltin<"__builtin_ia32_vpdpwsud128">, + DefaultAttrsIntrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpwsud_256 + : ClangBuiltin<"__builtin_ia32_vpdpwsud256">, + DefaultAttrsIntrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpwsuds_128 + : ClangBuiltin<"__builtin_ia32_vpdpwsuds128">, + DefaultAttrsIntrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpwsuds_256 + : ClangBuiltin<"__builtin_ia32_vpdpwsuds256">, + DefaultAttrsIntrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpwusd_128 + : ClangBuiltin<"__builtin_ia32_vpdpwusd128">, + DefaultAttrsIntrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpwusd_256 + : ClangBuiltin<"__builtin_ia32_vpdpwusd256">, + DefaultAttrsIntrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpwusds_128 + : ClangBuiltin<"__builtin_ia32_vpdpwusds128">, + DefaultAttrsIntrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpwusds_256 + : ClangBuiltin<"__builtin_ia32_vpdpwusds256">, + DefaultAttrsIntrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpwuud_128 + : ClangBuiltin<"__builtin_ia32_vpdpwuud128">, + DefaultAttrsIntrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, 
llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpwuud_256 + : ClangBuiltin<"__builtin_ia32_vpdpwuud256">, + DefaultAttrsIntrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpwuuds_128 + : ClangBuiltin<"__builtin_ia32_vpdpwuuds128">, + DefaultAttrsIntrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpwuuds_256 + : ClangBuiltin<"__builtin_ia32_vpdpwuuds256">, + DefaultAttrsIntrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; } //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -227,6 +227,7 @@ X86_FEATURE (AVXVNNI, "avxvnni") X86_FEATURE (AVXIFMA, "avxifma") X86_FEATURE (AVXVNNIINT8, "avxvnniint8") +X86_FEATURE (AVXVNNIINT16, "avxvnniint16") // These features aren't really CPU features, but the frontend can set them. 
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -191,6 +191,10 @@ "HasAVXVNNIINT8", "true", "Enable AVX-VNNI-INT8", [FeatureAVX2]>; +def FeatureAVXVNNIINT16 : SubtargetFeature<"avxvnniint16", + "HasAVXVNNIINT16", "true", + "Enable AVX-VNNI-INT16", + [FeatureAVX2]>; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2542,6 +2542,10 @@ case X86::VPDPWSSDrr: case X86::VPDPWSSDSYrr: case X86::VPDPWSSDSrr: + case X86::VPDPWUUDrr: + case X86::VPDPWUUDYrr: + case X86::VPDPWUUDSrr: + case X86::VPDPWUUDSYrr: case X86::VPDPBSSDSrr: case X86::VPDPBSSDSYrr: case X86::VPDPBSSDrr: diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -924,6 +924,7 @@ def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">; def HasBF16 : Predicate<"Subtarget->hasBF16()">; def HasFP16 : Predicate<"Subtarget->hasFP16()">; +def HasAVXVNNIINT16 : Predicate<"Subtarget->hasAVXVNNIINT16()">; def HasAVXVNNIINT8 : Predicate<"Subtarget->hasAVXVNNIINT8()">; def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">; def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -8295,3 +8295,47 @@ (VCVTNEPS2BF16rr VR128:$dst, VR128:$src), 0, "att">; def : InstAlias<"vcvtneps2bf16y\t{$src, $dst|$dst, $src}", 
(VCVTNEPS2BF16Yrr VR128:$dst, VR256:$src), 0, "att">;
+
+let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in
+multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
+  let isCommutable = IsCommutable in
+  def rr  : I<opc, MRMSrcReg, (outs VR128:$dst),
+              (ins VR128:$src1, VR128:$src2, VR128:$src3),
+              !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+              [(set VR128:$dst,
+                (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
+                        VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+              VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+  def rm  : I<opc, MRMSrcMem, (outs VR128:$dst),
+              (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+              !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+              [(set VR128:$dst,
+                (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
+                        VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>,
+              VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+  let isCommutable = IsCommutable in
+  def Yrr  : I<opc, MRMSrcReg, (outs VR256:$dst),
+               (ins VR256:$src1, VR256:$src2, VR256:$src3),
+               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+               [(set VR256:$dst,
+                 (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
+                         VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+               VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+
+  def Yrm  : I<opc, MRMSrcMem, (outs VR256:$dst),
+               (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+               [(set VR256:$dst,
+                 (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
+                         VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>,
+               VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+}
+
+defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8XS;
+defm VPDPWSUDS : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8XS;
+defm VPDPWUSD : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8PD;
+defm VPDPWUSDS : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8PD;
+defm VPDPWUUD : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8PS;
+defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8PS;
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1756,6 +1756,7 @@
   Features["avxvnniint8"] = HasLeaf7Subleaf1 && ((EDX >> 4) & 1) && HasAVXSave;
   Features["avxneconvert"] = HasLeaf7Subleaf1 && ((EDX >> 5) & 1) && HasAVXSave;
   Features["amx-complex"] = HasLeaf7Subleaf1 && ((EDX >> 8) & 1) && HasAMXSave;
+  Features["avxvnniint16"] = HasLeaf7Subleaf1 && ((EDX >> 10) & 1) && HasAVXSave;
   Features["prefetchi"] = HasLeaf7Subleaf1 && ((EDX >> 14) & 1);
 
   bool HasLeafD = MaxLevel >= 0xd &&
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ 
b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -652,6 +652,7 @@ constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {}; constexpr FeatureBitset ImpliedFeaturesCMPCCXADD = {}; constexpr FeatureBitset ImpliedFeaturesRAOINT = {}; +constexpr FeatureBitset ImpliedFeaturesAVXVNNIINT16 = FeatureAVX2; constexpr FeatureBitset ImpliedFeaturesAVXVNNIINT8 = FeatureAVX2; constexpr FeatureBitset ImpliedFeaturesAVXIFMA = FeatureAVX2; constexpr FeatureBitset ImpliedFeaturesAVXNECONVERT = FeatureAVX2; diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s + +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 
x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> 
@llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare 
<8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll @@ -0,0 +1,271 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O3 -disable-peephole -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s + +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x 
i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwsud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd2,0x44,0x24,0xe8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} + +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; 
CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwsud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd2,0x44,0x24,0xd8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} + +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwsuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd3,0x44,0x24,0xe8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} + +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: 
[0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwsuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd3,0x44,0x24,0xd8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} + +define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwusd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x71,0xd2,0x44,0x24,0xe8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} + +define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # 
encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwusd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x75,0xd2,0x44,0x24,0xd8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} + +define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwusds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x71,0xd3,0x44,0x24,0xe8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} + +define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8] +; CHECK-NEXT: #APP +; 
CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwusds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x75,0xd3,0x44,0x24,0xd8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} + +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x70,0xd2,0x44,0x24,0xe8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} + +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128_commuted(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8] 
+; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x70,0xd2,0x44,0x24,0xe8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %C, <4 x i32> %B) + ret <4 x i32> %ret +} + +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x74,0xd2,0x44,0x24,0xd8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} + +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256_commuted(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # encoding: 
[0xc5,0xfc,0x11,0x54,0x24,0xd8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x74,0xd2,0x44,0x24,0xd8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %C, <8 x i32> %B) + ret <8 x i32> %ret +} + +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x70,0xd3,0x44,0x24,0xe8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} + +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128_commuted(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x70,0xd3,0x44,0x24,0xe8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %C, <4 x i32> %B) + ret <4 x i32> %ret +} + +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x74,0xd3,0x44,0x24,0xd8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} + +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256_commuted(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8] +; CHECK-NEXT: #APP +; CHECK-NEXT: nop # encoding: [0x90] +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: # encoding: [0xc4,0xe2,0x74,0xd3,0x44,0x24,0xd8] +; CHECK-NEXT: retq # encoding: [0xc3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %C, <8 x i32> %B) + ret <8 x i32> %ret +} diff --git a/llvm/test/MC/Disassembler/X86/avx-vnni-int16-32.txt b/llvm/test/MC/Disassembler/X86/avx-vnni-int16-32.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx-vnni-int16-32.txt @@ -0,0 +1,339 @@ +# RUN: llvm-mc --disassemble %s -triple=i386-unknown-unknown | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=i386-unknown-unknown --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vpdpwsud %ymm4, %ymm3, %ymm2 +# INTEL: vpdpwsud ymm2, ymm3, ymm4 +0xc4,0xe2,0x66,0xd2,0xd4 + +# ATT: vpdpwsud %xmm4, %xmm3, %xmm2 +# INTEL: vpdpwsud xmm2, xmm3, xmm4 +0xc4,0xe2,0x62,0xd2,0xd4 + +# ATT: vpdpwsud 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsud 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x66,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsud (%eax), %ymm3, %ymm2 +# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0x66,0xd2,0x10 + +# ATT: vpdpwsud -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpwsud ymm2, ymm3, 
ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwsud 4064(%ecx), %ymm3, %ymm2 +# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [ecx + 4064] +0xc4,0xe2,0x66,0xd2,0x91,0xe0,0x0f,0x00,0x00 + +# ATT: vpdpwsud -4096(%edx), %ymm3, %ymm2 +# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [edx - 4096] +0xc4,0xe2,0x66,0xd2,0x92,0x00,0xf0,0xff,0xff + +# ATT: vpdpwsud 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsud 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x62,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsud (%eax), %xmm3, %xmm2 +# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0x62,0xd2,0x10 + +# ATT: vpdpwsud -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwsud 2032(%ecx), %xmm3, %xmm2 +# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [ecx + 2032] +0xc4,0xe2,0x62,0xd2,0x91,0xf0,0x07,0x00,0x00 + +# ATT: vpdpwsud -2048(%edx), %xmm3, %xmm2 +# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [edx - 2048] +0xc4,0xe2,0x62,0xd2,0x92,0x00,0xf8,0xff,0xff + +# ATT: vpdpwsuds %ymm4, %ymm3, %ymm2 +# INTEL: vpdpwsuds ymm2, ymm3, ymm4 +0xc4,0xe2,0x66,0xd3,0xd4 + +# ATT: vpdpwsuds %xmm4, %xmm3, %xmm2 +# INTEL: vpdpwsuds xmm2, xmm3, xmm4 +0xc4,0xe2,0x62,0xd3,0xd4 + +# ATT: vpdpwsuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsuds 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x66,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsuds (%eax), %ymm3, %ymm2 +# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0x66,0xd3,0x10 + +# ATT: vpdpwsuds -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: 
vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwsuds 4064(%ecx), %ymm3, %ymm2 +# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [ecx + 4064] +0xc4,0xe2,0x66,0xd3,0x91,0xe0,0x0f,0x00,0x00 + +# ATT: vpdpwsuds -4096(%edx), %ymm3, %ymm2 +# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [edx - 4096] +0xc4,0xe2,0x66,0xd3,0x92,0x00,0xf0,0xff,0xff + +# ATT: vpdpwsuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsuds 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x62,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsuds (%eax), %xmm3, %xmm2 +# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0x62,0xd3,0x10 + +# ATT: vpdpwsuds -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwsuds 2032(%ecx), %xmm3, %xmm2 +# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [ecx + 2032] +0xc4,0xe2,0x62,0xd3,0x91,0xf0,0x07,0x00,0x00 + +# ATT: vpdpwsuds -2048(%edx), %xmm3, %xmm2 +# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [edx - 2048] +0xc4,0xe2,0x62,0xd3,0x92,0x00,0xf8,0xff,0xff + +# ATT: vpdpwusd %ymm4, %ymm3, %ymm2 +# INTEL: vpdpwusd ymm2, ymm3, ymm4 +0xc4,0xe2,0x65,0xd2,0xd4 + +# ATT: vpdpwusd %xmm4, %xmm3, %xmm2 +# INTEL: vpdpwusd xmm2, xmm3, xmm4 +0xc4,0xe2,0x61,0xd2,0xd4 + +# ATT: vpdpwusd 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusd 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x65,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusd (%eax), %ymm3, %ymm2 +# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0x65,0xd2,0x10 + +# ATT: vpdpwusd 
-1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwusd 4064(%ecx), %ymm3, %ymm2 +# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [ecx + 4064] +0xc4,0xe2,0x65,0xd2,0x91,0xe0,0x0f,0x00,0x00 + +# ATT: vpdpwusd -4096(%edx), %ymm3, %ymm2 +# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [edx - 4096] +0xc4,0xe2,0x65,0xd2,0x92,0x00,0xf0,0xff,0xff + +# ATT: vpdpwusd 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusd 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x61,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusd (%eax), %xmm3, %xmm2 +# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0x61,0xd2,0x10 + +# ATT: vpdpwusd -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwusd 2032(%ecx), %xmm3, %xmm2 +# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [ecx + 2032] +0xc4,0xe2,0x61,0xd2,0x91,0xf0,0x07,0x00,0x00 + +# ATT: vpdpwusd -2048(%edx), %xmm3, %xmm2 +# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [edx - 2048] +0xc4,0xe2,0x61,0xd2,0x92,0x00,0xf8,0xff,0xff + +# ATT: vpdpwusds %ymm4, %ymm3, %ymm2 +# INTEL: vpdpwusds ymm2, ymm3, ymm4 +0xc4,0xe2,0x65,0xd3,0xd4 + +# ATT: vpdpwusds %xmm4, %xmm3, %xmm2 +# INTEL: vpdpwusds xmm2, xmm3, xmm4 +0xc4,0xe2,0x61,0xd3,0xd4 + +# ATT: vpdpwusds 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusds 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x65,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusds (%eax), %ymm3, %ymm2 +# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [eax] 
+0xc4,0xe2,0x65,0xd3,0x10 + +# ATT: vpdpwusds -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwusds 4064(%ecx), %ymm3, %ymm2 +# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [ecx + 4064] +0xc4,0xe2,0x65,0xd3,0x91,0xe0,0x0f,0x00,0x00 + +# ATT: vpdpwusds -4096(%edx), %ymm3, %ymm2 +# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [edx - 4096] +0xc4,0xe2,0x65,0xd3,0x92,0x00,0xf0,0xff,0xff + +# ATT: vpdpwusds 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusds 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x61,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusds (%eax), %xmm3, %xmm2 +# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0x61,0xd3,0x10 + +# ATT: vpdpwusds -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwusds 2032(%ecx), %xmm3, %xmm2 +# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [ecx + 2032] +0xc4,0xe2,0x61,0xd3,0x91,0xf0,0x07,0x00,0x00 + +# ATT: vpdpwusds -2048(%edx), %xmm3, %xmm2 +# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [edx - 2048] +0xc4,0xe2,0x61,0xd3,0x92,0x00,0xf8,0xff,0xff + +# ATT: vpdpwuud %ymm4, %ymm3, %ymm2 +# INTEL: vpdpwuud ymm2, ymm3, ymm4 +0xc4,0xe2,0x64,0xd2,0xd4 + +# ATT: vpdpwuud %xmm4, %xmm3, %xmm2 +# INTEL: vpdpwuud xmm2, xmm3, xmm4 +0xc4,0xe2,0x60,0xd2,0xd4 + +# ATT: vpdpwuud 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuud 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x64,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuud (%eax), %ymm3, %ymm2 +# INTEL: vpdpwuud 
ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0x64,0xd2,0x10 + +# ATT: vpdpwuud -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwuud 4064(%ecx), %ymm3, %ymm2 +# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [ecx + 4064] +0xc4,0xe2,0x64,0xd2,0x91,0xe0,0x0f,0x00,0x00 + +# ATT: vpdpwuud -4096(%edx), %ymm3, %ymm2 +# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [edx - 4096] +0xc4,0xe2,0x64,0xd2,0x92,0x00,0xf0,0xff,0xff + +# ATT: vpdpwuud 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuud 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x60,0xd2,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuud (%eax), %xmm3, %xmm2 +# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0x60,0xd2,0x10 + +# ATT: vpdpwuud -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwuud 2032(%ecx), %xmm3, %xmm2 +# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [ecx + 2032] +0xc4,0xe2,0x60,0xd2,0x91,0xf0,0x07,0x00,0x00 + +# ATT: vpdpwuud -2048(%edx), %xmm3, %xmm2 +# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [edx - 2048] +0xc4,0xe2,0x60,0xd2,0x92,0x00,0xf8,0xff,0xff + +# ATT: vpdpwuuds %ymm4, %ymm3, %ymm2 +# INTEL: vpdpwuuds ymm2, ymm3, ymm4 +0xc4,0xe2,0x64,0xd3,0xd4 + +# ATT: vpdpwuuds %xmm4, %xmm3, %xmm2 +# INTEL: vpdpwuuds xmm2, xmm3, xmm4 +0xc4,0xe2,0x60,0xd3,0xd4 + +# ATT: vpdpwuuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuuds 291(%edi,%eax,4), %ymm3, %ymm2 +# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x64,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuuds (%eax), %ymm3, %ymm2 
+# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [eax] +0xc4,0xe2,0x64,0xd3,0x10 + +# ATT: vpdpwuuds -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwuuds 4064(%ecx), %ymm3, %ymm2 +# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [ecx + 4064] +0xc4,0xe2,0x64,0xd3,0x91,0xe0,0x0f,0x00,0x00 + +# ATT: vpdpwuuds -4096(%edx), %ymm3, %ymm2 +# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [edx - 4096] +0xc4,0xe2,0x64,0xd3,0x92,0x00,0xf0,0xff,0xff + +# ATT: vpdpwuuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuuds 291(%edi,%eax,4), %xmm3, %xmm2 +# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x60,0xd3,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuuds (%eax), %xmm3, %xmm2 +# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [eax] +0xc4,0xe2,0x60,0xd3,0x10 + +# ATT: vpdpwuuds -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwuuds 2032(%ecx), %xmm3, %xmm2 +# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [ecx + 2032] +0xc4,0xe2,0x60,0xd3,0x91,0xf0,0x07,0x00,0x00 + +# ATT: vpdpwuuds -2048(%edx), %xmm3, %xmm2 +# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [edx - 2048] +0xc4,0xe2,0x60,0xd3,0x92,0x00,0xf8,0xff,0xff + diff --git a/llvm/test/MC/Disassembler/X86/avx-vnni-int16-64.txt b/llvm/test/MC/Disassembler/X86/avx-vnni-int16-64.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx-vnni-int16-64.txt @@ -0,0 +1,339 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vpdpwsud %ymm4, %ymm13, %ymm12 +# INTEL: vpdpwsud ymm12, ymm13, ymm4 +0xc4,0x62,0x16,0xd2,0xe4 + +# ATT: vpdpwsud 
%xmm4, %xmm13, %xmm12 +# INTEL: vpdpwsud xmm12, xmm13, xmm4 +0xc4,0x62,0x12,0xd2,0xe4 + +# ATT: vpdpwsud 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x16,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsud 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x16,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsud (%rip), %ymm13, %ymm12 +# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x16,0xd2,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpwsud -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x16,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwsud 4064(%rcx), %ymm13, %ymm12 +# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [rcx + 4064] +0xc4,0x62,0x16,0xd2,0xa1,0xe0,0x0f,0x00,0x00 + +# ATT: vpdpwsud -4096(%rdx), %ymm13, %ymm12 +# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [rdx - 4096] +0xc4,0x62,0x16,0xd2,0xa2,0x00,0xf0,0xff,0xff + +# ATT: vpdpwsud 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x12,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsud 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x12,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsud (%rip), %xmm13, %xmm12 +# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x12,0xd2,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpwsud -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x12,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwsud 2032(%rcx), %xmm13, %xmm12 +# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [rcx + 2032] +0xc4,0x62,0x12,0xd2,0xa1,0xf0,0x07,0x00,0x00 + +# ATT: vpdpwsud -2048(%rdx), %xmm13, %xmm12 +# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [rdx - 2048] +0xc4,0x62,0x12,0xd2,0xa2,0x00,0xf8,0xff,0xff + +# ATT: vpdpwsuds 
%ymm4, %ymm13, %ymm12 +# INTEL: vpdpwsuds ymm12, ymm13, ymm4 +0xc4,0x62,0x16,0xd3,0xe4 + +# ATT: vpdpwsuds %xmm4, %xmm13, %xmm12 +# INTEL: vpdpwsuds xmm12, xmm13, xmm4 +0xc4,0x62,0x12,0xd3,0xe4 + +# ATT: vpdpwsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x16,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsuds 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x16,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsuds (%rip), %ymm13, %ymm12 +# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x16,0xd3,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpwsuds -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x16,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwsuds 4064(%rcx), %ymm13, %ymm12 +# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [rcx + 4064] +0xc4,0x62,0x16,0xd3,0xa1,0xe0,0x0f,0x00,0x00 + +# ATT: vpdpwsuds -4096(%rdx), %ymm13, %ymm12 +# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [rdx - 4096] +0xc4,0x62,0x16,0xd3,0xa2,0x00,0xf0,0xff,0xff + +# ATT: vpdpwsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x12,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwsuds 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x12,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwsuds (%rip), %xmm13, %xmm12 +# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x12,0xd3,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpwsuds -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x12,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwsuds 2032(%rcx), %xmm13, %xmm12 +# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [rcx + 2032] +0xc4,0x62,0x12,0xd3,0xa1,0xf0,0x07,0x00,0x00 + +# ATT: vpdpwsuds -2048(%rdx), %xmm13, 
%xmm12 +# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [rdx - 2048] +0xc4,0x62,0x12,0xd3,0xa2,0x00,0xf8,0xff,0xff + +# ATT: vpdpwusd %ymm4, %ymm13, %ymm12 +# INTEL: vpdpwusd ymm12, ymm13, ymm4 +0xc4,0x62,0x15,0xd2,0xe4 + +# ATT: vpdpwusd %xmm4, %xmm13, %xmm12 +# INTEL: vpdpwusd xmm12, xmm13, xmm4 +0xc4,0x62,0x11,0xd2,0xe4 + +# ATT: vpdpwusd 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x15,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusd 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x15,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusd (%rip), %ymm13, %ymm12 +# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x15,0xd2,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpwusd -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x15,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwusd 4064(%rcx), %ymm13, %ymm12 +# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [rcx + 4064] +0xc4,0x62,0x15,0xd2,0xa1,0xe0,0x0f,0x00,0x00 + +# ATT: vpdpwusd -4096(%rdx), %ymm13, %ymm12 +# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [rdx - 4096] +0xc4,0x62,0x15,0xd2,0xa2,0x00,0xf0,0xff,0xff + +# ATT: vpdpwusd 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x11,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusd 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x11,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusd (%rip), %xmm13, %xmm12 +# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x11,0xd2,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpwusd -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x11,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwusd 2032(%rcx), %xmm13, %xmm12 +# INTEL: vpdpwusd xmm12, xmm13, xmmword 
ptr [rcx + 2032] +0xc4,0x62,0x11,0xd2,0xa1,0xf0,0x07,0x00,0x00 + +# ATT: vpdpwusd -2048(%rdx), %xmm13, %xmm12 +# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [rdx - 2048] +0xc4,0x62,0x11,0xd2,0xa2,0x00,0xf8,0xff,0xff + +# ATT: vpdpwusds %ymm4, %ymm13, %ymm12 +# INTEL: vpdpwusds ymm12, ymm13, ymm4 +0xc4,0x62,0x15,0xd3,0xe4 + +# ATT: vpdpwusds %xmm4, %xmm13, %xmm12 +# INTEL: vpdpwusds xmm12, xmm13, xmm4 +0xc4,0x62,0x11,0xd3,0xe4 + +# ATT: vpdpwusds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x15,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusds 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x15,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusds (%rip), %ymm13, %ymm12 +# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x15,0xd3,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpwusds -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x15,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwusds 4064(%rcx), %ymm13, %ymm12 +# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [rcx + 4064] +0xc4,0x62,0x15,0xd3,0xa1,0xe0,0x0f,0x00,0x00 + +# ATT: vpdpwusds -4096(%rdx), %ymm13, %ymm12 +# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [rdx - 4096] +0xc4,0x62,0x15,0xd3,0xa2,0x00,0xf0,0xff,0xff + +# ATT: vpdpwusds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x11,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwusds 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x11,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwusds (%rip), %xmm13, %xmm12 +# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x11,0xd3,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpwusds -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [2*rbp - 512] 
+0xc4,0x62,0x11,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwusds 2032(%rcx), %xmm13, %xmm12 +# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [rcx + 2032] +0xc4,0x62,0x11,0xd3,0xa1,0xf0,0x07,0x00,0x00 + +# ATT: vpdpwusds -2048(%rdx), %xmm13, %xmm12 +# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [rdx - 2048] +0xc4,0x62,0x11,0xd3,0xa2,0x00,0xf8,0xff,0xff + +# ATT: vpdpwuud %ymm4, %ymm13, %ymm12 +# INTEL: vpdpwuud ymm12, ymm13, ymm4 +0xc4,0x62,0x14,0xd2,0xe4 + +# ATT: vpdpwuud %xmm4, %xmm13, %xmm12 +# INTEL: vpdpwuud xmm12, xmm13, xmm4 +0xc4,0x62,0x10,0xd2,0xe4 + +# ATT: vpdpwuud 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x14,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuud 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x14,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuud (%rip), %ymm13, %ymm12 +# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x14,0xd2,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpwuud -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x14,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwuud 4064(%rcx), %ymm13, %ymm12 +# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [rcx + 4064] +0xc4,0x62,0x14,0xd2,0xa1,0xe0,0x0f,0x00,0x00 + +# ATT: vpdpwuud -4096(%rdx), %ymm13, %ymm12 +# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [rdx - 4096] +0xc4,0x62,0x14,0xd2,0xa2,0x00,0xf0,0xff,0xff + +# ATT: vpdpwuud 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x10,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuud 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x10,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuud (%rip), %xmm13, %xmm12 +# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x10,0xd2,0x25,0x00,0x00,0x00,0x00 + 
+# ATT: vpdpwuud -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x10,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwuud 2032(%rcx), %xmm13, %xmm12 +# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [rcx + 2032] +0xc4,0x62,0x10,0xd2,0xa1,0xf0,0x07,0x00,0x00 + +# ATT: vpdpwuud -2048(%rdx), %xmm13, %xmm12 +# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [rdx - 2048] +0xc4,0x62,0x10,0xd2,0xa2,0x00,0xf8,0xff,0xff + +# ATT: vpdpwuuds %ymm4, %ymm13, %ymm12 +# INTEL: vpdpwuuds ymm12, ymm13, ymm4 +0xc4,0x62,0x14,0xd3,0xe4 + +# ATT: vpdpwuuds %xmm4, %xmm13, %xmm12 +# INTEL: vpdpwuuds xmm12, xmm13, xmm4 +0xc4,0x62,0x10,0xd3,0xe4 + +# ATT: vpdpwuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x14,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuuds 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x14,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpwuuds (%rip), %ymm13, %ymm12 +# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x14,0xd3,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpwuuds -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x14,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpwuuds 4064(%rcx), %ymm13, %ymm12 +# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [rcx + 4064] +0xc4,0x62,0x14,0xd3,0xa1,0xe0,0x0f,0x00,0x00 + +# ATT: vpdpwuuds -4096(%rdx), %ymm13, %ymm12 +# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [rdx - 4096] +0xc4,0x62,0x14,0xd3,0xa2,0x00,0xf0,0xff,0xff + +# ATT: vpdpwuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x10,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpwuuds 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x10,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: 
vpdpwuuds (%rip), %xmm13, %xmm12 +# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x10,0xd3,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpwuuds -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x10,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpwuuds 2032(%rcx), %xmm13, %xmm12 +# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [rcx + 2032] +0xc4,0x62,0x10,0xd3,0xa1,0xf0,0x07,0x00,0x00 + +# ATT: vpdpwuuds -2048(%rdx), %xmm13, %xmm12 +# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [rdx - 2048] +0xc4,0x62,0x10,0xd3,0xa2,0x00,0xf8,0xff,0xff + diff --git a/llvm/test/MC/X86/avx-vnni-int16-32-att.s b/llvm/test/MC/X86/avx-vnni-int16-32-att.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/avx-vnni-int16-32-att.s @@ -0,0 +1,338 @@ +// RUN: llvm-mc -triple i686-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: vpdpwsud %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0xd4] + vpdpwsud %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpwsud %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0xd4] + vpdpwsud %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpwsud 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsud 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpwsud 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsud 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpwsud (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x10] + vpdpwsud (%eax), %ymm3, %ymm2 + +// CHECK: vpdpwsud -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwsud -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpwsud 4064(%ecx), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x91,0xe0,0x0f,0x00,0x00] + vpdpwsud 4064(%ecx), %ymm3, %ymm2 + +// CHECK: vpdpwsud -4096(%edx), %ymm3, %ymm2 +// CHECK: encoding: 
[0xc4,0xe2,0x66,0xd2,0x92,0x00,0xf0,0xff,0xff] + vpdpwsud -4096(%edx), %ymm3, %ymm2 + +// CHECK: vpdpwsud 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsud 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpwsud 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsud 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpwsud (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x10] + vpdpwsud (%eax), %xmm3, %xmm2 + +// CHECK: vpdpwsud -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwsud -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpwsud 2032(%ecx), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x91,0xf0,0x07,0x00,0x00] + vpdpwsud 2032(%ecx), %xmm3, %xmm2 + +// CHECK: vpdpwsud -2048(%edx), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x92,0x00,0xf8,0xff,0xff] + vpdpwsud -2048(%edx), %xmm3, %xmm2 + +// CHECK: vpdpwsuds %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0xd4] + vpdpwsuds %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpwsuds %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0xd4] + vpdpwsuds %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpwsuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsuds 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpwsuds 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsuds 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpwsuds (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x10] + vpdpwsuds (%eax), %ymm3, %ymm2 + +// CHECK: vpdpwsuds -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwsuds -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpwsuds 4064(%ecx), %ymm3, %ymm2 +// CHECK: encoding: 
[0xc4,0xe2,0x66,0xd3,0x91,0xe0,0x0f,0x00,0x00] + vpdpwsuds 4064(%ecx), %ymm3, %ymm2 + +// CHECK: vpdpwsuds -4096(%edx), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x92,0x00,0xf0,0xff,0xff] + vpdpwsuds -4096(%edx), %ymm3, %ymm2 + +// CHECK: vpdpwsuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsuds 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpwsuds 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsuds 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpwsuds (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x10] + vpdpwsuds (%eax), %xmm3, %xmm2 + +// CHECK: vpdpwsuds -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwsuds -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpwsuds 2032(%ecx), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x91,0xf0,0x07,0x00,0x00] + vpdpwsuds 2032(%ecx), %xmm3, %xmm2 + +// CHECK: vpdpwsuds -2048(%edx), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x92,0x00,0xf8,0xff,0xff] + vpdpwsuds -2048(%edx), %xmm3, %xmm2 + +// CHECK: vpdpwusd %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0xd4] + vpdpwusd %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpwusd %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0xd4] + vpdpwusd %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpwusd 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusd 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpwusd 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusd 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpwusd (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x10] + vpdpwusd (%eax), %ymm3, %ymm2 + +// CHECK: vpdpwusd -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: 
[0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwusd -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpwusd 4064(%ecx), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x91,0xe0,0x0f,0x00,0x00] + vpdpwusd 4064(%ecx), %ymm3, %ymm2 + +// CHECK: vpdpwusd -4096(%edx), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x92,0x00,0xf0,0xff,0xff] + vpdpwusd -4096(%edx), %ymm3, %ymm2 + +// CHECK: vpdpwusd 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusd 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpwusd 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusd 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpwusd (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x10] + vpdpwusd (%eax), %xmm3, %xmm2 + +// CHECK: vpdpwusd -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwusd -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpwusd 2032(%ecx), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x91,0xf0,0x07,0x00,0x00] + vpdpwusd 2032(%ecx), %xmm3, %xmm2 + +// CHECK: vpdpwusd -2048(%edx), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x92,0x00,0xf8,0xff,0xff] + vpdpwusd -2048(%edx), %xmm3, %xmm2 + +// CHECK: vpdpwusds %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0xd4] + vpdpwusds %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpwusds %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0xd4] + vpdpwusds %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpwusds 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusds 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpwusds 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusds 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpwusds (%eax), %ymm3, %ymm2 +// CHECK: encoding: 
[0xc4,0xe2,0x65,0xd3,0x10] + vpdpwusds (%eax), %ymm3, %ymm2 + +// CHECK: vpdpwusds -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwusds -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpwusds 4064(%ecx), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x91,0xe0,0x0f,0x00,0x00] + vpdpwusds 4064(%ecx), %ymm3, %ymm2 + +// CHECK: vpdpwusds -4096(%edx), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x92,0x00,0xf0,0xff,0xff] + vpdpwusds -4096(%edx), %ymm3, %ymm2 + +// CHECK: vpdpwusds 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusds 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpwusds 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusds 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpwusds (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x10] + vpdpwusds (%eax), %xmm3, %xmm2 + +// CHECK: vpdpwusds -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwusds -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpwusds 2032(%ecx), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x91,0xf0,0x07,0x00,0x00] + vpdpwusds 2032(%ecx), %xmm3, %xmm2 + +// CHECK: vpdpwusds -2048(%edx), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x92,0x00,0xf8,0xff,0xff] + vpdpwusds -2048(%edx), %xmm3, %xmm2 + +// CHECK: vpdpwuud %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0xd4] + vpdpwuud %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpwuud %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0xd4] + vpdpwuud %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpwuud 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuud 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpwuud 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: 
[0xc4,0xe2,0x64,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuud 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpwuud (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x10] + vpdpwuud (%eax), %ymm3, %ymm2 + +// CHECK: vpdpwuud -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwuud -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpwuud 4064(%ecx), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x91,0xe0,0x0f,0x00,0x00] + vpdpwuud 4064(%ecx), %ymm3, %ymm2 + +// CHECK: vpdpwuud -4096(%edx), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x92,0x00,0xf0,0xff,0xff] + vpdpwuud -4096(%edx), %ymm3, %ymm2 + +// CHECK: vpdpwuud 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuud 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpwuud 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuud 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpwuud (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x10] + vpdpwuud (%eax), %xmm3, %xmm2 + +// CHECK: vpdpwuud -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwuud -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpwuud 2032(%ecx), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x91,0xf0,0x07,0x00,0x00] + vpdpwuud 2032(%ecx), %xmm3, %xmm2 + +// CHECK: vpdpwuud -2048(%edx), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x92,0x00,0xf8,0xff,0xff] + vpdpwuud -2048(%edx), %xmm3, %xmm2 + +// CHECK: vpdpwuuds %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0xd4] + vpdpwuuds %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpwuuds %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0xd4] + vpdpwuuds %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpwuuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: 
[0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuuds 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpwuuds 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuuds 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpwuuds (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x10] + vpdpwuuds (%eax), %ymm3, %ymm2 + +// CHECK: vpdpwuuds -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwuuds -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpwuuds 4064(%ecx), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x91,0xe0,0x0f,0x00,0x00] + vpdpwuuds 4064(%ecx), %ymm3, %ymm2 + +// CHECK: vpdpwuuds -4096(%edx), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x92,0x00,0xf0,0xff,0xff] + vpdpwuuds -4096(%edx), %ymm3, %ymm2 + +// CHECK: vpdpwuuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuuds 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpwuuds 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuuds 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpwuuds (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x10] + vpdpwuuds (%eax), %xmm3, %xmm2 + +// CHECK: vpdpwuuds -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwuuds -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpwuuds 2032(%ecx), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x91,0xf0,0x07,0x00,0x00] + vpdpwuuds 2032(%ecx), %xmm3, %xmm2 + +// CHECK: vpdpwuuds -2048(%edx), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x92,0x00,0xf8,0xff,0xff] + vpdpwuuds -2048(%edx), %xmm3, %xmm2 + diff --git a/llvm/test/MC/X86/avx-vnni-int16-32-intel.s b/llvm/test/MC/X86/avx-vnni-int16-32-intel.s new file mode 100644 --- /dev/null +++ 
b/llvm/test/MC/X86/avx-vnni-int16-32-intel.s @@ -0,0 +1,338 @@ +// RUN: llvm-mc -triple i686-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vpdpwsud ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0xd4] + vpdpwsud ymm2, ymm3, ymm4 + +// CHECK: vpdpwsud xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0xd4] + vpdpwsud xmm2, xmm3, xmm4 + +// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x10] + vpdpwsud ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x91,0xe0,0x0f,0x00,0x00] + vpdpwsud ymm2, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [edx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x92,0x00,0xf0,0xff,0xff] + vpdpwsud ymm2, ymm3, ymmword ptr [edx - 4096] + +// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x10] + 
vpdpwsud xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x91,0xf0,0x07,0x00,0x00] + vpdpwsud xmm2, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [edx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x92,0x00,0xf8,0xff,0xff] + vpdpwsud xmm2, xmm3, xmmword ptr [edx - 2048] + +// CHECK: vpdpwsuds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0xd4] + vpdpwsuds ymm2, ymm3, ymm4 + +// CHECK: vpdpwsuds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0xd4] + vpdpwsuds xmm2, xmm3, xmm4 + +// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x10] + vpdpwsuds ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x91,0xe0,0x0f,0x00,0x00] + vpdpwsuds ymm2, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [edx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x92,0x00,0xf0,0xff,0xff] + vpdpwsuds ymm2, ymm3, ymmword ptr [edx - 4096] + +// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: 
[0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x10] + vpdpwsuds xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x91,0xf0,0x07,0x00,0x00] + vpdpwsuds xmm2, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [edx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x92,0x00,0xf8,0xff,0xff] + vpdpwsuds xmm2, xmm3, xmmword ptr [edx - 2048] + +// CHECK: vpdpwusd ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0xd4] + vpdpwusd ymm2, ymm3, ymm4 + +// CHECK: vpdpwusd xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0xd4] + vpdpwusd xmm2, xmm3, xmm4 + +// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x10] + vpdpwusd ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [ecx + 
4064] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x91,0xe0,0x0f,0x00,0x00] + vpdpwusd ymm2, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [edx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x92,0x00,0xf0,0xff,0xff] + vpdpwusd ymm2, ymm3, ymmword ptr [edx - 4096] + +// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x10] + vpdpwusd xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x91,0xf0,0x07,0x00,0x00] + vpdpwusd xmm2, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [edx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x92,0x00,0xf8,0xff,0xff] + vpdpwusd xmm2, xmm3, xmmword ptr [edx - 2048] + +// CHECK: vpdpwusds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0xd4] + vpdpwusds ymm2, ymm3, ymm4 + +// CHECK: vpdpwusds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0xd4] + vpdpwusds xmm2, xmm3, xmm4 + +// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusds ymm2, ymm3, ymmword ptr [edi + 
4*eax + 291] + +// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x10] + vpdpwusds ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x91,0xe0,0x0f,0x00,0x00] + vpdpwusds ymm2, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [edx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x92,0x00,0xf0,0xff,0xff] + vpdpwusds ymm2, ymm3, ymmword ptr [edx - 4096] + +// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwusds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x10] + vpdpwusds xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x91,0xf0,0x07,0x00,0x00] + vpdpwusds xmm2, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [edx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x92,0x00,0xf8,0xff,0xff] + vpdpwusds xmm2, xmm3, xmmword ptr [edx - 2048] + +// CHECK: vpdpwuud ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0xd4] + vpdpwuud ymm2, ymm3, ymm4 + +// CHECK: vpdpwuud xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0xd4] + vpdpwuud xmm2, 
xmm3, xmm4 + +// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x10] + vpdpwuud ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x91,0xe0,0x0f,0x00,0x00] + vpdpwuud ymm2, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [edx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x92,0x00,0xf0,0xff,0xff] + vpdpwuud ymm2, ymm3, ymmword ptr [edx - 4096] + +// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x10] + vpdpwuud xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x91,0xf0,0x07,0x00,0x00] + vpdpwuud xmm2, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: 
vpdpwuud xmm2, xmm3, xmmword ptr [edx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x92,0x00,0xf8,0xff,0xff] + vpdpwuud xmm2, xmm3, xmmword ptr [edx - 2048] + +// CHECK: vpdpwuuds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0xd4] + vpdpwuuds ymm2, ymm3, ymm4 + +// CHECK: vpdpwuuds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0xd4] + vpdpwuuds xmm2, xmm3, xmm4 + +// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x10] + vpdpwuuds ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x91,0xe0,0x0f,0x00,0x00] + vpdpwuuds ymm2, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [edx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x92,0x00,0xf0,0xff,0xff] + vpdpwuuds ymm2, ymm3, ymmword ptr [edx - 4096] + +// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpwuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x10] + vpdpwuuds 
xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x91,0xf0,0x07,0x00,0x00] + vpdpwuuds xmm2, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [edx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x92,0x00,0xf8,0xff,0xff] + vpdpwuuds xmm2, xmm3, xmmword ptr [edx - 2048] + diff --git a/llvm/test/MC/X86/avx-vnni-int16-64-att.s b/llvm/test/MC/X86/avx-vnni-int16-64-att.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/avx-vnni-int16-64-att.s @@ -0,0 +1,338 @@ +// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s + +// CHECK: vpdpwsud %ymm4, %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xe4] + vpdpwsud %ymm4, %ymm13, %ymm12 + +// CHECK: vpdpwsud %xmm4, %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xe4] + vpdpwsud %xmm4, %xmm13, %xmm12 + +// CHECK: vpdpwsud 268435456(%rbp,%r14,8), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x22,0x16,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsud 268435456(%rbp,%r14,8), %ymm13, %ymm12 + +// CHECK: vpdpwsud 291(%r8,%rax,4), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x16,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwsud 291(%r8,%rax,4), %ymm13, %ymm12 + +// CHECK: vpdpwsud (%rip), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0x25,0x00,0x00,0x00,0x00] + vpdpwsud (%rip), %ymm13, %ymm12 + +// CHECK: vpdpwsud -1024(,%rbp,2), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpwsud -1024(,%rbp,2), %ymm13, %ymm12 + +// CHECK: vpdpwsud 4064(%rcx), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xa1,0xe0,0x0f,0x00,0x00] + vpdpwsud 4064(%rcx), %ymm13, %ymm12 + +// CHECK: vpdpwsud -4096(%rdx), %ymm13, %ymm12 +// CHECK: encoding: 
[0xc4,0x62,0x16,0xd2,0xa2,0x00,0xf0,0xff,0xff] + vpdpwsud -4096(%rdx), %ymm13, %ymm12 + +// CHECK: vpdpwsud 268435456(%rbp,%r14,8), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x22,0x12,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsud 268435456(%rbp,%r14,8), %xmm13, %xmm12 + +// CHECK: vpdpwsud 291(%r8,%rax,4), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x12,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwsud 291(%r8,%rax,4), %xmm13, %xmm12 + +// CHECK: vpdpwsud (%rip), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0x25,0x00,0x00,0x00,0x00] + vpdpwsud (%rip), %xmm13, %xmm12 + +// CHECK: vpdpwsud -512(,%rbp,2), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpwsud -512(,%rbp,2), %xmm13, %xmm12 + +// CHECK: vpdpwsud 2032(%rcx), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xa1,0xf0,0x07,0x00,0x00] + vpdpwsud 2032(%rcx), %xmm13, %xmm12 + +// CHECK: vpdpwsud -2048(%rdx), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xa2,0x00,0xf8,0xff,0xff] + vpdpwsud -2048(%rdx), %xmm13, %xmm12 + +// CHECK: vpdpwsuds %ymm4, %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xe4] + vpdpwsuds %ymm4, %ymm13, %ymm12 + +// CHECK: vpdpwsuds %xmm4, %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xe4] + vpdpwsuds %xmm4, %xmm13, %xmm12 + +// CHECK: vpdpwsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x22,0x16,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 + +// CHECK: vpdpwsuds 291(%r8,%rax,4), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x16,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwsuds 291(%r8,%rax,4), %ymm13, %ymm12 + +// CHECK: vpdpwsuds (%rip), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0x25,0x00,0x00,0x00,0x00] + vpdpwsuds (%rip), %ymm13, %ymm12 + +// CHECK: vpdpwsuds -1024(,%rbp,2), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpwsuds -1024(,%rbp,2), %ymm13, %ymm12 + 
+// CHECK: vpdpwsuds 4064(%rcx), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xa1,0xe0,0x0f,0x00,0x00] + vpdpwsuds 4064(%rcx), %ymm13, %ymm12 + +// CHECK: vpdpwsuds -4096(%rdx), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xa2,0x00,0xf0,0xff,0xff] + vpdpwsuds -4096(%rdx), %ymm13, %ymm12 + +// CHECK: vpdpwsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x22,0x12,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 + +// CHECK: vpdpwsuds 291(%r8,%rax,4), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x12,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwsuds 291(%r8,%rax,4), %xmm13, %xmm12 + +// CHECK: vpdpwsuds (%rip), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0x25,0x00,0x00,0x00,0x00] + vpdpwsuds (%rip), %xmm13, %xmm12 + +// CHECK: vpdpwsuds -512(,%rbp,2), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpwsuds -512(,%rbp,2), %xmm13, %xmm12 + +// CHECK: vpdpwsuds 2032(%rcx), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xa1,0xf0,0x07,0x00,0x00] + vpdpwsuds 2032(%rcx), %xmm13, %xmm12 + +// CHECK: vpdpwsuds -2048(%rdx), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xa2,0x00,0xf8,0xff,0xff] + vpdpwsuds -2048(%rdx), %xmm13, %xmm12 + +// CHECK: vpdpwusd %ymm4, %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xe4] + vpdpwusd %ymm4, %ymm13, %ymm12 + +// CHECK: vpdpwusd %xmm4, %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xe4] + vpdpwusd %xmm4, %xmm13, %xmm12 + +// CHECK: vpdpwusd 268435456(%rbp,%r14,8), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x22,0x15,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusd 268435456(%rbp,%r14,8), %ymm13, %ymm12 + +// CHECK: vpdpwusd 291(%r8,%rax,4), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x15,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwusd 291(%r8,%rax,4), %ymm13, %ymm12 + +// CHECK: vpdpwusd (%rip), %ymm13, %ymm12 +// CHECK: encoding: 
[0xc4,0x62,0x15,0xd2,0x25,0x00,0x00,0x00,0x00] + vpdpwusd (%rip), %ymm13, %ymm12 + +// CHECK: vpdpwusd -1024(,%rbp,2), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpwusd -1024(,%rbp,2), %ymm13, %ymm12 + +// CHECK: vpdpwusd 4064(%rcx), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xa1,0xe0,0x0f,0x00,0x00] + vpdpwusd 4064(%rcx), %ymm13, %ymm12 + +// CHECK: vpdpwusd -4096(%rdx), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xa2,0x00,0xf0,0xff,0xff] + vpdpwusd -4096(%rdx), %ymm13, %ymm12 + +// CHECK: vpdpwusd 268435456(%rbp,%r14,8), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x22,0x11,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusd 268435456(%rbp,%r14,8), %xmm13, %xmm12 + +// CHECK: vpdpwusd 291(%r8,%rax,4), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x11,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwusd 291(%r8,%rax,4), %xmm13, %xmm12 + +// CHECK: vpdpwusd (%rip), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0x25,0x00,0x00,0x00,0x00] + vpdpwusd (%rip), %xmm13, %xmm12 + +// CHECK: vpdpwusd -512(,%rbp,2), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpwusd -512(,%rbp,2), %xmm13, %xmm12 + +// CHECK: vpdpwusd 2032(%rcx), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xa1,0xf0,0x07,0x00,0x00] + vpdpwusd 2032(%rcx), %xmm13, %xmm12 + +// CHECK: vpdpwusd -2048(%rdx), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xa2,0x00,0xf8,0xff,0xff] + vpdpwusd -2048(%rdx), %xmm13, %xmm12 + +// CHECK: vpdpwusds %ymm4, %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xe4] + vpdpwusds %ymm4, %ymm13, %ymm12 + +// CHECK: vpdpwusds %xmm4, %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xe4] + vpdpwusds %xmm4, %xmm13, %xmm12 + +// CHECK: vpdpwusds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x22,0x15,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusds 268435456(%rbp,%r14,8), %ymm13, %ymm12 + +// CHECK: 
vpdpwusds 291(%r8,%rax,4), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x15,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwusds 291(%r8,%rax,4), %ymm13, %ymm12 + +// CHECK: vpdpwusds (%rip), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0x25,0x00,0x00,0x00,0x00] + vpdpwusds (%rip), %ymm13, %ymm12 + +// CHECK: vpdpwusds -1024(,%rbp,2), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpwusds -1024(,%rbp,2), %ymm13, %ymm12 + +// CHECK: vpdpwusds 4064(%rcx), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xa1,0xe0,0x0f,0x00,0x00] + vpdpwusds 4064(%rcx), %ymm13, %ymm12 + +// CHECK: vpdpwusds -4096(%rdx), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xa2,0x00,0xf0,0xff,0xff] + vpdpwusds -4096(%rdx), %ymm13, %ymm12 + +// CHECK: vpdpwusds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x22,0x11,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusds 268435456(%rbp,%r14,8), %xmm13, %xmm12 + +// CHECK: vpdpwusds 291(%r8,%rax,4), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x11,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwusds 291(%r8,%rax,4), %xmm13, %xmm12 + +// CHECK: vpdpwusds (%rip), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0x25,0x00,0x00,0x00,0x00] + vpdpwusds (%rip), %xmm13, %xmm12 + +// CHECK: vpdpwusds -512(,%rbp,2), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpwusds -512(,%rbp,2), %xmm13, %xmm12 + +// CHECK: vpdpwusds 2032(%rcx), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xa1,0xf0,0x07,0x00,0x00] + vpdpwusds 2032(%rcx), %xmm13, %xmm12 + +// CHECK: vpdpwusds -2048(%rdx), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xa2,0x00,0xf8,0xff,0xff] + vpdpwusds -2048(%rdx), %xmm13, %xmm12 + +// CHECK: vpdpwuud %ymm4, %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xe4] + vpdpwuud %ymm4, %ymm13, %ymm12 + +// CHECK: vpdpwuud %xmm4, %xmm13, %xmm12 +// CHECK: encoding: 
[0xc4,0x62,0x10,0xd2,0xe4] + vpdpwuud %xmm4, %xmm13, %xmm12 + +// CHECK: vpdpwuud 268435456(%rbp,%r14,8), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x22,0x14,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuud 268435456(%rbp,%r14,8), %ymm13, %ymm12 + +// CHECK: vpdpwuud 291(%r8,%rax,4), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x14,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwuud 291(%r8,%rax,4), %ymm13, %ymm12 + +// CHECK: vpdpwuud (%rip), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0x25,0x00,0x00,0x00,0x00] + vpdpwuud (%rip), %ymm13, %ymm12 + +// CHECK: vpdpwuud -1024(,%rbp,2), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpwuud -1024(,%rbp,2), %ymm13, %ymm12 + +// CHECK: vpdpwuud 4064(%rcx), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xa1,0xe0,0x0f,0x00,0x00] + vpdpwuud 4064(%rcx), %ymm13, %ymm12 + +// CHECK: vpdpwuud -4096(%rdx), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xa2,0x00,0xf0,0xff,0xff] + vpdpwuud -4096(%rdx), %ymm13, %ymm12 + +// CHECK: vpdpwuud 268435456(%rbp,%r14,8), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x22,0x10,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuud 268435456(%rbp,%r14,8), %xmm13, %xmm12 + +// CHECK: vpdpwuud 291(%r8,%rax,4), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x10,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwuud 291(%r8,%rax,4), %xmm13, %xmm12 + +// CHECK: vpdpwuud (%rip), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0x25,0x00,0x00,0x00,0x00] + vpdpwuud (%rip), %xmm13, %xmm12 + +// CHECK: vpdpwuud -512(,%rbp,2), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpwuud -512(,%rbp,2), %xmm13, %xmm12 + +// CHECK: vpdpwuud 2032(%rcx), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xa1,0xf0,0x07,0x00,0x00] + vpdpwuud 2032(%rcx), %xmm13, %xmm12 + +// CHECK: vpdpwuud -2048(%rdx), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xa2,0x00,0xf8,0xff,0xff] + vpdpwuud 
-2048(%rdx), %xmm13, %xmm12 + +// CHECK: vpdpwuuds %ymm4, %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xe4] + vpdpwuuds %ymm4, %ymm13, %ymm12 + +// CHECK: vpdpwuuds %xmm4, %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xe4] + vpdpwuuds %xmm4, %xmm13, %xmm12 + +// CHECK: vpdpwuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x22,0x14,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 + +// CHECK: vpdpwuuds 291(%r8,%rax,4), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x42,0x14,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwuuds 291(%r8,%rax,4), %ymm13, %ymm12 + +// CHECK: vpdpwuuds (%rip), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0x25,0x00,0x00,0x00,0x00] + vpdpwuuds (%rip), %ymm13, %ymm12 + +// CHECK: vpdpwuuds -1024(,%rbp,2), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpwuuds -1024(,%rbp,2), %ymm13, %ymm12 + +// CHECK: vpdpwuuds 4064(%rcx), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xa1,0xe0,0x0f,0x00,0x00] + vpdpwuuds 4064(%rcx), %ymm13, %ymm12 + +// CHECK: vpdpwuuds -4096(%rdx), %ymm13, %ymm12 +// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xa2,0x00,0xf0,0xff,0xff] + vpdpwuuds -4096(%rdx), %ymm13, %ymm12 + +// CHECK: vpdpwuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x22,0x10,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 + +// CHECK: vpdpwuuds 291(%r8,%rax,4), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x42,0x10,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwuuds 291(%r8,%rax,4), %xmm13, %xmm12 + +// CHECK: vpdpwuuds (%rip), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0x25,0x00,0x00,0x00,0x00] + vpdpwuuds (%rip), %xmm13, %xmm12 + +// CHECK: vpdpwuuds -512(,%rbp,2), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpwuuds -512(,%rbp,2), %xmm13, %xmm12 + +// CHECK: vpdpwuuds 2032(%rcx), %xmm13, %xmm12 
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xa1,0xf0,0x07,0x00,0x00] + vpdpwuuds 2032(%rcx), %xmm13, %xmm12 + +// CHECK: vpdpwuuds -2048(%rdx), %xmm13, %xmm12 +// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xa2,0x00,0xf8,0xff,0xff] + vpdpwuuds -2048(%rdx), %xmm13, %xmm12 + diff --git a/llvm/test/MC/X86/avx-vnni-int16-64-intel.s b/llvm/test/MC/X86/avx-vnni-int16-64-intel.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/avx-vnni-int16-64-intel.s @@ -0,0 +1,338 @@ +// RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vpdpwsud ymm12, ymm13, ymm4 +// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xe4] + vpdpwsud ymm12, ymm13, ymm4 + +// CHECK: vpdpwsud xmm12, xmm13, xmm4 +// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xe4] + vpdpwsud xmm12, xmm13, xmm4 + +// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x16,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x16,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0x25,0x00,0x00,0x00,0x00] + vpdpwsud ymm12, ymm13, ymmword ptr [rip] + +// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpwsud ymm12, ymm13, ymmword ptr [2*rbp - 1024] + +// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xa1,0xe0,0x0f,0x00,0x00] + vpdpwsud ymm12, ymm13, ymmword ptr [rcx + 4064] + +// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [rdx - 4096] +// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xa2,0x00,0xf0,0xff,0xff] + vpdpwsud ymm12, ymm13, ymmword ptr [rdx - 4096] + +// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 
268435456] +// CHECK: encoding: [0xc4,0x22,0x12,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x12,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0x25,0x00,0x00,0x00,0x00] + vpdpwsud xmm12, xmm13, xmmword ptr [rip] + +// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpwsud xmm12, xmm13, xmmword ptr [2*rbp - 512] + +// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xa1,0xf0,0x07,0x00,0x00] + vpdpwsud xmm12, xmm13, xmmword ptr [rcx + 2032] + +// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [rdx - 2048] +// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xa2,0x00,0xf8,0xff,0xff] + vpdpwsud xmm12, xmm13, xmmword ptr [rdx - 2048] + +// CHECK: vpdpwsuds ymm12, ymm13, ymm4 +// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xe4] + vpdpwsuds ymm12, ymm13, ymm4 + +// CHECK: vpdpwsuds xmm12, xmm13, xmm4 +// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xe4] + vpdpwsuds xmm12, xmm13, xmm4 + +// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x16,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x16,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0x25,0x00,0x00,0x00,0x00] + vpdpwsuds ymm12, ymm13, ymmword ptr [rip] + +// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: 
[0xc4,0x62,0x16,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpwsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] + +// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xa1,0xe0,0x0f,0x00,0x00] + vpdpwsuds ymm12, ymm13, ymmword ptr [rcx + 4064] + +// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [rdx - 4096] +// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xa2,0x00,0xf0,0xff,0xff] + vpdpwsuds ymm12, ymm13, ymmword ptr [rdx - 4096] + +// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x12,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x12,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0x25,0x00,0x00,0x00,0x00] + vpdpwsuds xmm12, xmm13, xmmword ptr [rip] + +// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpwsuds xmm12, xmm13, xmmword ptr [2*rbp - 512] + +// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xa1,0xf0,0x07,0x00,0x00] + vpdpwsuds xmm12, xmm13, xmmword ptr [rcx + 2032] + +// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [rdx - 2048] +// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xa2,0x00,0xf8,0xff,0xff] + vpdpwsuds xmm12, xmm13, xmmword ptr [rdx - 2048] + +// CHECK: vpdpwusd ymm12, ymm13, ymm4 +// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xe4] + vpdpwusd ymm12, ymm13, ymm4 + +// CHECK: vpdpwusd xmm12, xmm13, xmm4 +// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xe4] + vpdpwusd xmm12, xmm13, xmm4 + +// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x15,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusd ymm12, 
ymm13, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x15,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwusd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0x25,0x00,0x00,0x00,0x00] + vpdpwusd ymm12, ymm13, ymmword ptr [rip] + +// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpwusd ymm12, ymm13, ymmword ptr [2*rbp - 1024] + +// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xa1,0xe0,0x0f,0x00,0x00] + vpdpwusd ymm12, ymm13, ymmword ptr [rcx + 4064] + +// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [rdx - 4096] +// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xa2,0x00,0xf0,0xff,0xff] + vpdpwusd ymm12, ymm13, ymmword ptr [rdx - 4096] + +// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x11,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x11,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwusd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0x25,0x00,0x00,0x00,0x00] + vpdpwusd xmm12, xmm13, xmmword ptr [rip] + +// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpwusd xmm12, xmm13, xmmword ptr [2*rbp - 512] + +// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xa1,0xf0,0x07,0x00,0x00] + vpdpwusd xmm12, xmm13, xmmword ptr [rcx + 2032] + +// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [rdx - 2048] +// CHECK: encoding: 
[0xc4,0x62,0x11,0xd2,0xa2,0x00,0xf8,0xff,0xff] + vpdpwusd xmm12, xmm13, xmmword ptr [rdx - 2048] + +// CHECK: vpdpwusds ymm12, ymm13, ymm4 +// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xe4] + vpdpwusds ymm12, ymm13, ymm4 + +// CHECK: vpdpwusds xmm12, xmm13, xmm4 +// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xe4] + vpdpwusds xmm12, xmm13, xmm4 + +// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x15,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x15,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwusds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0x25,0x00,0x00,0x00,0x00] + vpdpwusds ymm12, ymm13, ymmword ptr [rip] + +// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpwusds ymm12, ymm13, ymmword ptr [2*rbp - 1024] + +// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xa1,0xe0,0x0f,0x00,0x00] + vpdpwusds ymm12, ymm13, ymmword ptr [rcx + 4064] + +// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [rdx - 4096] +// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xa2,0x00,0xf0,0xff,0xff] + vpdpwusds ymm12, ymm13, ymmword ptr [rdx - 4096] + +// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x11,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwusds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x11,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwusds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [rip] +// CHECK: encoding: 
[0xc4,0x62,0x11,0xd3,0x25,0x00,0x00,0x00,0x00] + vpdpwusds xmm12, xmm13, xmmword ptr [rip] + +// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpwusds xmm12, xmm13, xmmword ptr [2*rbp - 512] + +// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xa1,0xf0,0x07,0x00,0x00] + vpdpwusds xmm12, xmm13, xmmword ptr [rcx + 2032] + +// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [rdx - 2048] +// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xa2,0x00,0xf8,0xff,0xff] + vpdpwusds xmm12, xmm13, xmmword ptr [rdx - 2048] + +// CHECK: vpdpwuud ymm12, ymm13, ymm4 +// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xe4] + vpdpwuud ymm12, ymm13, ymm4 + +// CHECK: vpdpwuud xmm12, xmm13, xmm4 +// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xe4] + vpdpwuud xmm12, xmm13, xmm4 + +// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x14,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x14,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0x25,0x00,0x00,0x00,0x00] + vpdpwuud ymm12, ymm13, ymmword ptr [rip] + +// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpwuud ymm12, ymm13, ymmword ptr [2*rbp - 1024] + +// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xa1,0xe0,0x0f,0x00,0x00] + vpdpwuud ymm12, ymm13, ymmword ptr [rcx + 4064] + +// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [rdx - 4096] +// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xa2,0x00,0xf0,0xff,0xff] + vpdpwuud ymm12, ymm13, ymmword ptr [rdx - 4096] + 
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x10,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x10,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0x25,0x00,0x00,0x00,0x00] + vpdpwuud xmm12, xmm13, xmmword ptr [rip] + +// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpwuud xmm12, xmm13, xmmword ptr [2*rbp - 512] + +// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xa1,0xf0,0x07,0x00,0x00] + vpdpwuud xmm12, xmm13, xmmword ptr [rcx + 2032] + +// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [rdx - 2048] +// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xa2,0x00,0xf8,0xff,0xff] + vpdpwuud xmm12, xmm13, xmmword ptr [rdx - 2048] + +// CHECK: vpdpwuuds ymm12, ymm13, ymm4 +// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xe4] + vpdpwuuds ymm12, ymm13, ymm4 + +// CHECK: vpdpwuuds xmm12, xmm13, xmm4 +// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xe4] + vpdpwuuds xmm12, xmm13, xmm4 + +// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x14,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x14,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0x25,0x00,0x00,0x00,0x00] + vpdpwuuds ymm12, ymm13, ymmword ptr [rip] + +// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +// 
CHECK: encoding: [0xc4,0x62,0x14,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff] + vpdpwuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] + +// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xa1,0xe0,0x0f,0x00,0x00] + vpdpwuuds ymm12, ymm13, ymmword ptr [rcx + 4064] + +// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [rdx - 4096] +// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xa2,0x00,0xf0,0xff,0xff] + vpdpwuuds ymm12, ymm13, ymmword ptr [rdx - 4096] + +// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0x22,0x10,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10] + vpdpwuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0x42,0x10,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + vpdpwuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0x25,0x00,0x00,0x00,0x00] + vpdpwuuds xmm12, xmm13, xmmword ptr [rip] + +// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff] + vpdpwuuds xmm12, xmm13, xmmword ptr [2*rbp - 512] + +// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xa1,0xf0,0x07,0x00,0x00] + vpdpwuuds xmm12, xmm13, xmmword ptr [rcx + 2032] + +// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [rdx - 2048] +// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xa2,0x00,0xf8,0xff,0xff] + vpdpwuuds xmm12, xmm13, xmmword ptr [rdx - 2048] + diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -4236,6 +4236,18 @@ {X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0}, {X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0}, {X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0}, + {X86::VPDPWSUDSYrr, X86::VPDPWSUDSYrm, 0}, + 
{X86::VPDPWSUDSrr, X86::VPDPWSUDSrm, 0}, + {X86::VPDPWSUDYrr, X86::VPDPWSUDYrm, 0}, + {X86::VPDPWSUDrr, X86::VPDPWSUDrm, 0}, + {X86::VPDPWUSDSYrr, X86::VPDPWUSDSYrm, 0}, + {X86::VPDPWUSDSrr, X86::VPDPWUSDSrm, 0}, + {X86::VPDPWUSDYrr, X86::VPDPWUSDYrm, 0}, + {X86::VPDPWUSDrr, X86::VPDPWUSDrm, 0}, + {X86::VPDPWUUDSYrr, X86::VPDPWUUDSYrm, 0}, + {X86::VPDPWUUDSrr, X86::VPDPWUUDSrm, 0}, + {X86::VPDPWUUDYrr, X86::VPDPWUUDYrm, 0}, + {X86::VPDPWUUDrr, X86::VPDPWUUDrm, 0}, {X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0}, {X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0}, {X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0},