diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst --- a/clang/docs/ClangCommandLineReference.rst +++ b/clang/docs/ClangCommandLineReference.rst @@ -3253,6 +3253,8 @@ .. option:: -mavx512vpopcntdq, -mno-avx512vpopcntdq +.. option:: -mavxvnni, -mno-avxvnni + .. option:: -mbmi, -mno-bmi .. option:: -mbmi2, -mno-bmi2 diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -205,6 +205,8 @@ - Support for ``UINTR`` instructions has been added. +- Support for ``AVXVNNI`` instructions has been added. + Internal API Changes -------------------- diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -960,17 +960,17 @@ TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "ncV:512:", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni") -TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni") +TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni") +TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni") TARGET_BUILTIN(__builtin_ia32_vpdpbusd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni") -TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni") -TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni") +TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni") +TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni") TARGET_BUILTIN(__builtin_ia32_vpdpbusds512, 
"V16iV16iV16iV16i", "ncV:512:", "avx512vnni") -TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni") -TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni") +TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni") +TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni") TARGET_BUILTIN(__builtin_ia32_vpdpwssd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni") -TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni") -TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni") +TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni") +TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni") TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni") TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3235,6 +3235,8 @@ def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>; def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group<m_x86_Features_Group>; def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group<m_x86_Features_Group>; +def mavxvnni : Flag<["-"], "mavxvnni">, Group<m_x86_Features_Group>; +def mno_avxvnni : Flag<["-"], "mno-avxvnni">, Group<m_x86_Features_Group>; def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>; def mno_adx : Flag<["-"], "mno-adx">, Group<m_x86_Features_Group>; def maes : Flag<["-"], "maes">, Group<m_x86_Features_Group>; diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -130,6 +130,7 @@ bool HasKL = false; // For key locker bool HasWIDEKL = false; // For wide key 
locker bool HasHRESET = false; + bool HasAVXVNNI = false; bool HasAMXTILE = false; bool HasAMXINT8 = false; bool HasAMXBF16 = false; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -306,6 +306,8 @@ HasAMXINT8 = true; } else if (Feature == "+amx-tile") { HasAMXTILE = true; + } else if (Feature == "+avxvnni") { + HasAVXVNNI = true; } else if (Feature == "+serialize") { HasSERIALIZE = true; } else if (Feature == "+tsxldtrk") { @@ -728,6 +730,8 @@ Builder.defineMacro("__AMXINT8__"); if (HasAMXBF16) Builder.defineMacro("__AMXBF16__"); + if (HasAVXVNNI) + Builder.defineMacro("__AVXVNNI__"); if (HasSERIALIZE) Builder.defineMacro("__SERIALIZE__"); if (HasTSXLDTRK) @@ -846,6 +850,7 @@ .Case("avx512vbmi2", true) .Case("avx512ifma", true) .Case("avx512vp2intersect", true) + .Case("avxvnni", true) .Case("bmi", true) .Case("bmi2", true) .Case("cldemote", true) @@ -918,6 +923,7 @@ .Case("amx-bf16", HasAMXBF16) .Case("amx-int8", HasAMXINT8) .Case("amx-tile", HasAMXTILE) + .Case("avxvnni", HasAVXVNNI) .Case("avx", SSELevel >= AVX) .Case("avx2", SSELevel >= AVX2) .Case("avx512f", SSELevel >= AVX512F) diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -35,6 +35,7 @@ avx512vnniintrin.h avx512vlvnniintrin.h avxintrin.h + avxvnniintrin.h bmi2intrin.h bmiintrin.h __clang_cuda_builtin_vars.h diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h --- a/clang/lib/Headers/avx512vlvnniintrin.h +++ b/clang/lib/Headers/avx512vlvnniintrin.h @@ -18,13 +18,157 @@ #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128))) #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256))) +/// 
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpbusd_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, - (__v8si)__B); -} +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. 
+/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpbusds_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpwssd_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S +/// using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. 
+/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpwssds_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpbusd_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. 
+/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpbusds_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpwssd_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S +/// using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. 
+/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpwssds_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) @@ -42,13 +186,6 @@ (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, - (__v8si)__B); -} - static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { @@ -65,13 +202,6 @@ (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, - (__v8si)__B); -} - static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { @@ -88,13 +218,6 @@ (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, - (__v8si)__B); -} - static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { @@ -111,13 +234,6 @@ (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, - (__v4si)__B); -} - static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { @@ -134,13 +250,6 @@ (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, - (__v4si)__B); -} - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { @@ -157,13 +266,6 @@ (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, - (__v4si)__B); -} - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { @@ -180,13 +282,6 @@ (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, - (__v4si)__B); -} - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/avxvnniintrin.h @@ -0,0 +1,225 @@ +/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice 
and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVXVNNIINTRIN_H +#define __AVXVNNIINTRIN_H + +/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */ +/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) +/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) +/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) +/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) +/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) +/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) +/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) +/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) + +/* Intrinsics with _avx_ prefix are for compatibility with msvc. */ +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128))) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. 
+/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S +/// using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. 
+/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S using signed saturation, and store the packed 32-bit results in DST. 
+/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S +/// using signed saturation, and store the packed 32-bit results in DST. 
+/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif // __AVXVNNIINTRIN_H diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h --- a/clang/lib/Headers/cpuid.h +++ b/clang/lib/Headers/cpuid.h @@ -196,6 +196,7 @@ #define bit_AMXINT8 0x02000000 /* Features in %eax for leaf 7 sub-leaf 1 */ +#define bit_AVXVNNI 0x00000010 #define bit_AVX512BF16 0x00000020 #define bit_HRESET 0x00400000 diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -145,6 +145,11 @@ #include <avx512vlvnniintrin.h> #endif +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVXVNNI__) +#include <avxvnniintrin.h> +#endif + #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ defined(__AVX512DQ__) #include <avx512dqintrin.h> diff --git a/clang/test/CodeGen/X86/avxvnni-builtins.c b/clang/test/CodeGen/X86/avxvnni-builtins.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/X86/avxvnni-builtins.c @@ -0,0 +1,99 @@ +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror | FileCheck %s + +#include <immintrin.h> + +__m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_dpbusd_epi32 + // CHECK: @llvm.x86.avx512.vpdpbusd.256 + return _mm256_dpbusd_epi32(__S, __A, __B); +} + +__m256i 
test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_dpbusds_epi32 + // CHECK: @llvm.x86.avx512.vpdpbusds.256 + return _mm256_dpbusds_epi32(__S, __A, __B); +} + +__m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_dpwssd_epi32 + // CHECK: @llvm.x86.avx512.vpdpwssd.256 + return _mm256_dpwssd_epi32(__S, __A, __B); +} + +__m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_dpwssds_epi32 + // CHECK: @llvm.x86.avx512.vpdpwssds.256 + return _mm256_dpwssds_epi32(__S, __A, __B); +} + +__m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_dpbusd_epi32 + // CHECK: @llvm.x86.avx512.vpdpbusd.128 + return _mm_dpbusd_epi32(__S, __A, __B); +} + +__m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_dpbusds_epi32 + // CHECK: @llvm.x86.avx512.vpdpbusds.128 + return _mm_dpbusds_epi32(__S, __A, __B); +} + +__m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_dpwssd_epi32 + // CHECK: @llvm.x86.avx512.vpdpwssd.128 + return _mm_dpwssd_epi32(__S, __A, __B); +} + +__m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_dpwssds_epi32 + // CHECK: @llvm.x86.avx512.vpdpwssds.128 + return _mm_dpwssds_epi32(__S, __A, __B); +} + +__m256i test_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_dpbusd_avx_epi32 + // CHECK: @llvm.x86.avx512.vpdpbusd.256 + return _mm256_dpbusd_avx_epi32(__S, __A, __B); +} + +__m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_dpbusds_avx_epi32 + // CHECK: @llvm.x86.avx512.vpdpbusds.256 + return _mm256_dpbusds_avx_epi32(__S, __A, __B); +} + +__m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { + // CHECK-LABEL: 
@test_mm256_dpwssd_avx_epi32 + // CHECK: @llvm.x86.avx512.vpdpwssd.256 + return _mm256_dpwssd_avx_epi32(__S, __A, __B); +} + +__m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { + // CHECK-LABEL: @test_mm256_dpwssds_avx_epi32 + // CHECK: @llvm.x86.avx512.vpdpwssds.256 + return _mm256_dpwssds_avx_epi32(__S, __A, __B); +} + +__m128i test_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_dpbusd_avx_epi32 + // CHECK: @llvm.x86.avx512.vpdpbusd.128 + return _mm_dpbusd_avx_epi32(__S, __A, __B); +} + +__m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_dpbusds_avx_epi32 + // CHECK: @llvm.x86.avx512.vpdpbusds.128 + return _mm_dpbusds_avx_epi32(__S, __A, __B); +} + +__m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_dpwssd_avx_epi32 + // CHECK: @llvm.x86.avx512.vpdpwssd.128 + return _mm_dpwssd_avx_epi32(__S, __A, __B); +} + +__m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { + // CHECK-LABEL: @test_mm_dpwssds_avx_epi32 + // CHECK: @llvm.x86.avx512.vpdpwssds.128 + return _mm_dpwssds_avx_epi32(__S, __A, __B); +} diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c --- a/clang/test/CodeGen/attr-target-x86.c +++ b/clang/test/CodeGen/attr-target-x86.c @@ -54,9 +54,9 @@ // CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686" // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" // CHECK-NOT: tune-cpu -// CHECK: #2 = {{.*}}"target-cpu"="i686" 
"target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" +// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686" -// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" +// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes" // CHECK-NOT: tune-cpu // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx" diff --git a/clang/test/Driver/x86-target-features.c 
b/clang/test/Driver/x86-target-features.c --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -288,3 +288,8 @@ // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-uintr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-UINTR %s // UINTR: "-target-feature" "+uintr" // NO-UINTR: "-target-feature" "-uintr" + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavxvnni %s -### -o %t.o 2>&1 | FileCheck --check-prefix=AVX-VNNI %s +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avxvnni %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-AVX-VNNI %s +// AVX-VNNI: "-target-feature" "+avxvnni" +// NO-AVX-VNNI: "-target-feature" "-avxvnni" diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -1654,6 +1654,7 @@ // CHECK_SPR_M32: #define __AVX512VL__ 1 // CHECK_SPR_M32: #define __AVX512VNNI__ 1 // CHECK_SPR_M32: #define __AVX512VPOPCNTDQ__ 1 +// CHECK_SPR_M32: #define __AVXVNNI__ 1 // CHECK_SPR_M32: #define __AVX__ 1 // CHECK_SPR_M32: #define __BMI2__ 1 // CHECK_SPR_M32: #define __BMI__ 1 @@ -1724,6 +1725,7 @@ // CHECK_SPR_M64: #define __AVX512VL__ 1 // CHECK_SPR_M64: #define __AVX512VNNI__ 1 // CHECK_SPR_M64: #define __AVX512VPOPCNTDQ__ 1 +// CHECK_SPR_M64: #define __AVXVNNI__ 1 // CHECK_SPR_M64: #define __AVX__ 1 // CHECK_SPR_M64: #define __BMI2__ 1 // CHECK_SPR_M64: #define __BMI__ 1 @@ -1782,6 +1784,7 @@ // CHECK_ADL_M32: #define __AES__ 1 // CHECK_ADL_M32: #define __AVX2__ 1 // CHECK_ADL_M32-NOT: AVX512 +// CHECK_ADL_M32: #define __AVXVNNI__ 1 // CHECK_ADL_M32: #define __AVX__ 1 // CHECK_ADL_M32: #define __BMI2__ 1 // CHECK_ADL_M32: #define __BMI__ 1 @@ -1822,6 +1825,7 @@ // CHECK_ADL_M64: #define __AES__ 1 // CHECK_ADL_M64: #define __AVX2__ 1 // CHECK_ADL_M64-NOT: AVX512 +// CHECK_ADL_M64: #define __AVXVNNI__ 1 // 
CHECK_ADL_M64: #define __AVX__ 1 // CHECK_ADL_M64: #define __BMI2__ 1 // CHECK_ADL_M64: #define __BMI__ 1 diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -544,3 +544,17 @@ // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-uintr -x c -E -dM -o - %s | FileCheck -check-prefix=NOUINTR %s // NOUINTR-NOT: #define __UINTR__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s + +// AVXVNNI: #define __AVX2__ 1 +// AVXVNNI: #define __AVXVNNI__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mno-avxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOAVXVNNI %s + +// NOAVXVNNI-NOT: #define __AVXVNNI__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -mno-avx2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNINOAVX2 %s + +// AVXVNNINOAVX2-NOT: #define __AVX2__ 1 +// AVXVNNINOAVX2-NOT: #define __AVXVNNI__ 1 diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -121,6 +121,7 @@ the target CPU. * Support for ``HRESET`` instructions has been added. * Support for ``UINTR`` instructions has been added. +* Support for ``AVXVNNI`` instructions has been added. Changes to the AMDGPU Target ----------------------------- diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def --- a/llvm/include/llvm/Support/X86TargetParser.def +++ b/llvm/include/llvm/Support/X86TargetParser.def @@ -190,6 +190,7 @@ X86_FEATURE (XSAVEOPT, "xsaveopt") X86_FEATURE (XSAVES, "xsaves") X86_FEATURE (HRESET, "hreset") +X86_FEATURE (AVXVNNI, "avxvnni") // These features aren't really CPU features, but the frontend can set them. 
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -1497,6 +1497,7 @@ Features["amx-int8"] = HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave; bool HasLeaf7Subleaf1 = MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); + Features["avxvnni"] = HasLeaf7Subleaf1 && ((EAX >> 4) & 1) && HasAVXSave; Features["avx512bf16"] = HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save; Features["hreset"] = HasLeaf7Subleaf1 && ((EAX >> 22) & 1); diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp --- a/llvm/lib/Support/X86TargetParser.cpp +++ b/llvm/lib/Support/X86TargetParser.cpp @@ -205,10 +205,10 @@ FeatureAVX512BF16 | FeatureAVX512VP2INTERSECT | FeatureCLDEMOTE | FeatureENQCMD | FeatureMOVDIR64B | FeatureMOVDIRI | FeaturePTWRITE | FeatureSERIALIZE | FeatureSHSTK | FeatureTSXLDTRK | FeatureUINTR | - FeatureWAITPKG; + FeatureWAITPKG | FeatureAVXVNNI; constexpr FeatureBitset FeaturesAlderlake = FeaturesSkylakeClient | FeatureCLDEMOTE | FeatureHRESET | FeaturePTWRITE | - FeatureSERIALIZE | FeatureWAITPKG; + FeatureSERIALIZE | FeatureWAITPKG | FeatureAVXVNNI; // Intel Atom processors. // Bonnell has feature parity with Core2 and adds MOVBE. 
@@ -575,6 +575,9 @@ constexpr FeatureBitset ImpliedFeaturesKL = FeatureSSE2; constexpr FeatureBitset ImpliedFeaturesWIDEKL = FeatureKL; +// AVXVNNI Features +constexpr FeatureBitset ImpliedFeaturesAVXVNNI = FeatureAVX2; + constexpr FeatureInfo FeatureInfos[X86::CPU_FEATURE_MAX] = { #define X86_FEATURE(ENUM, STR) {{STR}, ImpliedFeatures##ENUM}, #include "llvm/Support/X86TargetParser.def" diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3845,6 +3845,13 @@ (MCID.TSFlags & X86II::EncodingMask) != X86II::VEX) return Match_Unsupported; + // These instructions are only available with {vex}, {vex2} or {vex3} prefix + if (MCID.TSFlags & X86II::ExplicitVEXPrefix && + (ForcedVEXEncoding != VEXEncoding_VEX && + ForcedVEXEncoding != VEXEncoding_VEX2 && + ForcedVEXEncoding != VEXEncoding_VEX3)) + return Match_Unsupported; + // These instructions match ambiguously with their VEX encoded counterparts // and appear first in the matching table. Reject them unless we're forcing // EVEX encoding. diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h --- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -952,7 +952,11 @@ // NOTRACK prefix NoTrackShift = EVEX_RCShift + 1, - NOTRACK = 1ULL << NoTrackShift + NOTRACK = 1ULL << NoTrackShift, + + // Force VEX encoding + ExplicitVEXShift = NoTrackShift + 1, + ExplicitVEXPrefix = 1ULL << ExplicitVEXShift }; /// \returns true if the instruction with given opcode is a prefix. 
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -348,7 +348,7 @@ O << "\trep\t"; // These all require a pseudo prefix - if (Flags & X86::IP_USE_VEX) + if ((Flags & X86::IP_USE_VEX) || (TSFlags & X86II::ExplicitVEXPrefix)) O << "\t{vex}"; else if (Flags & X86::IP_USE_VEX2) O << "\t{vex2}"; diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -171,6 +171,9 @@ def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true", "Enable AVX-512 Vector Neural Network Instructions", [FeatureAVX512]>; +def FeatureAVXVNNI : SubtargetFeature<"avxvnni", "HasAVXVNNI", "true", + "Support AVX_VNNI encoding", + [FeatureAVX2]>; def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true", "Support bfloat16 floating point", [FeatureBWI]>; @@ -769,6 +772,7 @@ FeatureCLDEMOTE, FeatureWAITPKG, FeaturePTWRITE, + FeatureAVXVNNI, FeatureTSXLDTRK, FeatureENQCMD, FeatureSHSTK, @@ -781,7 +785,8 @@ !listconcat(ICXFeatures, SPRAdditionalFeatures); // Alderlake - list ADLAdditionalFeatures = [FeatureCLDEMOTE, + list ADLAdditionalFeatures = [FeatureAVXVNNI, + FeatureCLDEMOTE, FeatureHRESET, FeaturePTWRITE, FeatureSERIALIZE, diff --git a/llvm/lib/Target/X86/X86EvexToVex.cpp b/llvm/lib/Target/X86/X86EvexToVex.cpp --- a/llvm/lib/Target/X86/X86EvexToVex.cpp +++ b/llvm/lib/Target/X86/X86EvexToVex.cpp @@ -85,6 +85,8 @@ private: /// Machine instruction info used throughout the class. 
const X86InstrInfo *TII = nullptr; + + const X86Subtarget *ST = nullptr; }; } // end anonymous namespace @@ -94,8 +96,8 @@ bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); - const X86Subtarget &ST = MF.getSubtarget(); - if (!ST.hasAVX512()) + ST = &MF.getSubtarget(); + if (!ST->hasAVX512()) return false; bool Changed = false; @@ -144,10 +146,29 @@ } // Do any custom cleanup needed to finalize the conversion. -static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) { +static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc, + const X86Subtarget *ST) { (void)NewOpc; unsigned Opc = MI.getOpcode(); switch (Opc) { + case X86::VPDPBUSDSZ256m: + case X86::VPDPBUSDSZ256r: + case X86::VPDPBUSDSZ128m: + case X86::VPDPBUSDSZ128r: + case X86::VPDPBUSDZ256m: + case X86::VPDPBUSDZ256r: + case X86::VPDPBUSDZ128m: + case X86::VPDPBUSDZ128r: + case X86::VPDPWSSDSZ256m: + case X86::VPDPWSSDSZ256r: + case X86::VPDPWSSDSZ128m: + case X86::VPDPWSSDSZ128r: + case X86::VPDPWSSDZ256m: + case X86::VPDPWSSDZ256r: + case X86::VPDPWSSDZ128m: + case X86::VPDPWSSDZ128r: + // These can only VEX convert if AVXVNNI is enabled. 
+ return ST->hasAVXVNNI(); case X86::VALIGNDZ128rri: case X86::VALIGNDZ128rmi: case X86::VALIGNQZ128rri: @@ -259,7 +280,7 @@ if (usesExtendedRegister(MI)) return false; - if (!performCustomAdjustments(MI, NewOpc)) + if (!performCustomAdjustments(MI, NewOpc, ST)) return false; MI.setDesc(TII->get(NewOpc)); diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -3748,18 +3748,26 @@ { X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 }, { X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 }, { X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 }, + { X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 }, { X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 }, { X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 }, { X86::VPDPBUSDSZr, X86::VPDPBUSDSZm, 0 }, + { X86::VPDPBUSDSrr, X86::VPDPBUSDSrm, 0 }, + { X86::VPDPBUSDYrr, X86::VPDPBUSDYrm, 0 }, { X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128m, 0 }, { X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 }, { X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 }, + { X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 }, + { X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 }, { X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 }, { X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 }, { X86::VPDPWSSDSZr, X86::VPDPWSSDSZm, 0 }, + { X86::VPDPWSSDSrr, X86::VPDPWSSDSrm, 0 }, + { X86::VPDPWSSDYrr, X86::VPDPWSSDYrm, 0 }, { X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128m, 0 }, { X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0 }, { X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0 }, + { X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0 }, { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 }, { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 }, { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 }, diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td --- a/llvm/lib/Target/X86/X86InstrFormats.td +++ b/llvm/lib/Target/X86/X86InstrFormats.td @@ -264,6 +264,9 @@ // Prevent EVEX->VEX conversion from considering 
this instruction. class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; } +// Force the instruction to use VEX encoding. +class ExplicitVEXPrefix { bit ExplicitVEXPrefix = 1; } + class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, string AsmStr, Domain d = GenericDomain> : Instruction { @@ -348,6 +351,7 @@ bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction? bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion. + bit ExplicitVEXPrefix = 0; // Force the instruction to use VEX encoding. // TSFlags layout should be kept in sync with X86BaseInfo.h. let TSFlags{6-0} = FormBits; @@ -376,6 +380,7 @@ let TSFlags{51-45} = CD8_Scale; let TSFlags{52} = hasEVEX_RC; let TSFlags{53} = hasNoTrackPrefix; + let TSFlags{54} = ExplicitVEXPrefix; } class PseudoI pattern> diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2568,6 +2568,10 @@ case X86::VPTERNLOGQZ256rmbikz: case X86::VPTERNLOGQZrmbikz: return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + case X86::VPDPWSSDYrr: + case X86::VPDPWSSDrr: + case X86::VPDPWSSDSYrr: + case X86::VPDPWSSDSrr: case X86::VPDPWSSDZ128r: case X86::VPDPWSSDZ128rk: case X86::VPDPWSSDZ128rkz: diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -910,6 +910,8 @@ def HasVNNI : Predicate<"Subtarget->hasVNNI()">; def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">; def HasBF16 : Predicate<"Subtarget->hasBF16()">; +def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">; +def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">; def HasBITALG : Predicate<"Subtarget->hasBITALG()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td 
b/llvm/lib/Target/X86/X86InstrSSE.td --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -7164,6 +7164,48 @@ int_x86_avx_maskstore_pd_256, WriteFMaskMove64, WriteFMaskMove64Y>; +//===----------------------------------------------------------------------===// +// AVX_VNNI +//===----------------------------------------------------------------------===// +let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in +multiclass avx_vnni_rm opc, string OpcodeStr, SDNode OpNode, + bit IsCommutable> { + let isCommutable = IsCommutable in + def rr : AVX8I, + VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; + + def rm : AVX8I, + VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; + + let isCommutable = IsCommutable in + def Yrr : AVX8I, + VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; + + def Yrm : AVX8I, + VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; +} + +defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix; +defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix; +defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>, ExplicitVEXPrefix; +defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix; + //===----------------------------------------------------------------------===// // VPERMIL - Permute Single and Double Floating-Point Values // diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -355,6 +355,9 @@ /// Processor has AVX-512 Vector Neural Network Instructions bool HasVNNI = false; + /// Processor has AVX Vector Neural Network Instructions + bool HasAVXVNNI = false; + /// Processor has AVX-512 bfloat16 floating-point extensions bool HasBF16 = false; @@ -750,6 +753,7 @@ bool useRetpolineIndirectBranches() const { return UseRetpolineIndirectBranches; } + bool hasAVXVNNI() const { return HasAVXVNNI; } bool hasAMXTILE() const { 
return HasAMXTILE; } bool hasAMXBF16() const { return HasAMXBF16; } bool hasAMXINT8() const { return HasAMXINT8; } diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI + +declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_256: +; AVXVNNI: # %bb.0: +; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x50,0xc2] +; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusd_256: +; AVX512VNNI: # %bb.0: +; AVX512VNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x50,0xc2] +; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_128: +; AVXVNNI: # %bb.0: +; AVXVNNI-NEXT: {vex} vpdpbusd %xmm2, 
%xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x50,0xc2] +; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusd_128: +; AVX512VNNI: # %bb.0: +; AVX512VNNI-NEXT: {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x50,0xc2] +; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_256: +; AVXVNNI: # %bb.0: +; AVXVNNI-NEXT: {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x51,0xc2] +; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusds_256: +; AVX512VNNI: # %bb.0: +; AVX512VNNI-NEXT: {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x51,0xc2] +; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_128: +; AVXVNNI: # %bb.0: +; AVXVNNI-NEXT: {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x51,0xc2] +; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusds_128: +; AVX512VNNI: # %bb.0: +; AVX512VNNI-NEXT: {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x51,0xc2] +; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x 
i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_256: +; AVXVNNI: # %bb.0: +; AVXVNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2] +; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssd_256: +; AVX512VNNI: # %bb.0: +; AVX512VNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x52,0xc2] +; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_128: +; AVXVNNI: # %bb.0: +; AVXVNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2] +; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssd_128: +; AVX512VNNI: # %bb.0: +; AVX512VNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x52,0xc2] +; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_256: +; AVXVNNI: # %bb.0: +; AVXVNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2] +; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssds_256: +; 
AVX512VNNI: # %bb.0: +; AVX512VNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x53,0xc2] +; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_128: +; AVXVNNI: # %bb.0: +; AVXVNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2] +; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssds_128: +; AVX512VNNI: # %bb.0: +; AVX512VNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x53,0xc2] +; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll @@ -0,0 +1,242 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnni < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x 
i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpwssd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpwssd_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + ret <4 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpwssd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, 
%ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + ret <8 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + ret <8 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpwssds: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpwssds_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; 
CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + ret <4 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpwssds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + ret <8 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + ret <8 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbusd: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbusd_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; CHECK-NEXT: {vex} vpdpbusd %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + ret <4 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbusd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + ret <8 x i32> %2 +} + +define <8 x i32> 
@stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbusd_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; CHECK-NEXT: {vex} vpdpbusd %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + ret <8 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbusds: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} + +define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbusds_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; CHECK-NEXT: {vex} vpdpbusds %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + ret <4 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbusds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + ret <8 x i32> %2 +} + +define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +; CHECK-LABEL: stack_fold_vpdpbusds_256_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; CHECK-NEXT: {vex} vpdpbusds %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + ret <8 x i32> %2 +} diff --git a/llvm/test/MC/Disassembler/X86/avx_vnni.txt b/llvm/test/MC/Disassembler/X86/avx_vnni.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx_vnni.txt @@ -0,0 +1,170 @@ +# RUN: llvm-mc --disassemble %s -triple=i686-apple-darwin9 | FileCheck %s + +# CHECK: {vex} vpdpbusd %ymm4, 
%ymm5, %ymm6 +0xc4,0xe2,0x55,0x50,0xf4 + +# CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6 +0xc4,0xe2,0x51,0x50,0xf4 + +# CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusd (%eax), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x50,0x30 + +# CHECK: {vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpbusd 4064(%ecx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpbusd -4096(%edx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusd (%eax), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x50,0x30 + +# CHECK: {vex} vpdpbusd -512(,%ebp,2), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpbusd 2032(%ecx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpbusd -2048(%edx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff + +# CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6 +0xc4,0xe2,0x55,0x51,0xf4 + +# CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6 +0xc4,0xe2,0x51,0x51,0xf4 + +# CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusds (%eax), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x51,0x30 + +# CHECK: {vex} vpdpbusds -1024(,%ebp,2), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpbusds 4064(%ecx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpbusds 
-4096(%edx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusds (%eax), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x51,0x30 + +# CHECK: {vex} vpdpbusds -512(,%ebp,2), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpbusds 2032(%ecx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpbusds -2048(%edx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff + +# CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6 +0xc4,0xe2,0x55,0x52,0xf4 + +# CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6 +0xc4,0xe2,0x51,0x52,0xf4 + +# CHECK: {vex} vpdpwssd 268435456(%esp,%esi,8), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssd 291(%edi,%eax,4), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssd (%eax), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x52,0x30 + +# CHECK: {vex} vpdpwssd -1024(,%ebp,2), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpwssd 4064(%ecx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpwssd -4096(%edx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpwssd 268435456(%esp,%esi,8), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssd 291(%edi,%eax,4), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssd (%eax), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x52,0x30 + +# CHECK: {vex} vpdpwssd -512(,%ebp,2), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpwssd 2032(%ecx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpwssd -2048(%edx), %xmm5, %xmm6 
+0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff + +# CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6 +0xc4,0xe2,0x55,0x53,0xf4 + +# CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6 +0xc4,0xe2,0x51,0x53,0xf4 + +# CHECK: {vex} vpdpwssds 268435456(%esp,%esi,8), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssds 291(%edi,%eax,4), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssds (%eax), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x53,0x30 + +# CHECK: {vex} vpdpwssds -1024(,%ebp,2), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpwssds 4064(%ecx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpwssds -4096(%edx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpwssds 268435456(%esp,%esi,8), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssds 291(%edi,%eax,4), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssds (%eax), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x53,0x30 + +# CHECK: {vex} vpdpwssds -512(,%ebp,2), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpwssds 2032(%ecx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpwssds -2048(%edx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff + diff --git a/llvm/test/MC/Disassembler/X86/intel-syntax-avx_vnni.txt b/llvm/test/MC/Disassembler/X86/intel-syntax-avx_vnni.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/intel-syntax-avx_vnni.txt @@ -0,0 +1,170 @@ +# RUN: llvm-mc --disassemble %s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4 +0xc4,0xe2,0x55,0x50,0xf4 + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4 +0xc4,0xe2,0x51,0x50,0xf4 + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456] 
+0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax] +0xc4,0xe2,0x55,0x50,0x30 + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064] +0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096] +0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax] +0xc4,0xe2,0x51,0x50,0x30 + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032] +0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048] +0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff + +# CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4 +0xc4,0xe2,0x55,0x51,0xf4 + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4 +0xc4,0xe2,0x51,0x51,0xf4 + +# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax] +0xc4,0xe2,0x55,0x51,0x30 + +# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064] +0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpbusds 
ymm6, ymm5, ymmword ptr [edx - 4096] +0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax] +0xc4,0xe2,0x51,0x51,0x30 + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032] +0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048] +0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4 +0xc4,0xe2,0x55,0x52,0xf4 + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4 +0xc4,0xe2,0x51,0x52,0xf4 + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax] +0xc4,0xe2,0x55,0x52,0x30 + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064] +0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096] +0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax] +0xc4,0xe2,0x51,0x52,0x30 + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512] 
+0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032] +0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048] +0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4 +0xc4,0xe2,0x55,0x53,0xf4 + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4 +0xc4,0xe2,0x51,0x53,0xf4 + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax] +0xc4,0xe2,0x55,0x53,0x30 + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024] +0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064] +0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096] +0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291] +0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax] +0xc4,0xe2,0x51,0x53,0x30 + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512] +0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032] +0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048] +0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff + diff --git a/llvm/test/MC/Disassembler/X86/intel-syntax-x86-64-avx_vnni.txt b/llvm/test/MC/Disassembler/X86/intel-syntax-x86-64-avx_vnni.txt new file mode 100644 --- /dev/null +++ 
b/llvm/test/MC/Disassembler/X86/intel-syntax-x86-64-avx_vnni.txt @@ -0,0 +1,170 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4 +0xc4,0xe2,0x55,0x50,0xf4 + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4 +0xc4,0xe2,0x51,0x50,0xf4 + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291] +0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip] +0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024] +0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064] +0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096] +0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291] +0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip] +0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512] +0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032] +0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048] +0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff + +# CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4 +0xc4,0xe2,0x55,0x51,0xf4 + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4 +0xc4,0xe2,0x51,0x51,0xf4 + +# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} 
vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291] +0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip] +0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024] +0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064] +0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096] +0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291] +0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip] +0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512] +0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032] +0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048] +0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4 +0xc4,0xe2,0x55,0x52,0xf4 + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4 +0xc4,0xe2,0x51,0x52,0xf4 + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291] +0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip] +0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024] +0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064] +0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpwssd ymm6, ymm5, 
ymmword ptr [rdx - 4096] +0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291] +0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip] +0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512] +0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032] +0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048] +0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4 +0xc4,0xe2,0x55,0x53,0xf4 + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4 +0xc4,0xe2,0x51,0x53,0xf4 + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291] +0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip] +0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024] +0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064] +0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096] +0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291] +0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip] +0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword 
ptr [2*rbp - 512] +0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032] +0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048] +0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff + diff --git a/llvm/test/MC/Disassembler/X86/x86-64-avx_vnni.txt b/llvm/test/MC/Disassembler/X86/x86-64-avx_vnni.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/x86-64-avx_vnni.txt @@ -0,0 +1,170 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 | FileCheck %s + +# CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6 +0xc4,0xe2,0x55,0x50,0xf4 + +# CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6 +0xc4,0xe2,0x51,0x50,0xf4 + +# CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %ymm5, %ymm6 +0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %ymm5, %ymm6 +0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusd (%rip), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpbusd -1024(,%rbp,2), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpbusd 4064(%rcx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpbusd -4096(%rdx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %xmm5, %xmm6 +0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %xmm5, %xmm6 +0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusd (%rip), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpbusd -512(,%rbp,2), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpbusd 2032(%rcx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpbusd -2048(%rdx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff + +# CHECK: {vex} 
vpdpbusds %ymm4, %ymm5, %ymm6 +0xc4,0xe2,0x55,0x51,0xf4 + +# CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6 +0xc4,0xe2,0x51,0x51,0xf4 + +# CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %ymm5, %ymm6 +0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %ymm5, %ymm6 +0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusds (%rip), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpbusds -1024(,%rbp,2), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpbusds 4064(%rcx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpbusds -4096(%rdx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %xmm5, %xmm6 +0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %xmm5, %xmm6 +0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpbusds (%rip), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpbusds -512(,%rbp,2), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpbusds 2032(%rcx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpbusds -2048(%rdx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff + +# CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6 +0xc4,0xe2,0x55,0x52,0xf4 + +# CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6 +0xc4,0xe2,0x51,0x52,0xf4 + +# CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %ymm5, %ymm6 +0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %ymm5, %ymm6 +0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssd (%rip), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpwssd -1024(,%rbp,2), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpwssd 4064(%rcx), %ymm5, 
%ymm6 +0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpwssd -4096(%rdx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %xmm5, %xmm6 +0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %xmm5, %xmm6 +0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssd (%rip), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpwssd -512(,%rbp,2), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} vpdpwssd 2032(%rcx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpwssd -2048(%rdx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff + +# CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6 +0xc4,0xe2,0x55,0x53,0xf4 + +# CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6 +0xc4,0xe2,0x51,0x53,0xf4 + +# CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %ymm5, %ymm6 +0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %ymm5, %ymm6 +0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssds (%rip), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpwssds -1024(,%rbp,2), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff + +# CHECK: {vex} vpdpwssds 4064(%rcx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00 + +# CHECK: {vex} vpdpwssds -4096(%rdx), %ymm5, %ymm6 +0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff + +# CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %xmm5, %xmm6 +0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %xmm5, %xmm6 +0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: {vex} vpdpwssds (%rip), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00 + +# CHECK: {vex} vpdpwssds -512(,%rbp,2), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff + +# CHECK: {vex} 
vpdpwssds 2032(%rcx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00 + +# CHECK: {vex} vpdpwssds -2048(%rdx), %xmm5, %xmm6 +0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff + diff --git a/llvm/test/MC/X86/avx_vnni-encoding.s b/llvm/test/MC/X86/avx_vnni-encoding.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/avx_vnni-encoding.s @@ -0,0 +1,226 @@ +// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnni --show-encoding < %s | FileCheck %s + +// CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4] + {vex} vpdpbusd %ymm4, %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4] + {vex} vpdpbusd %xmm4, %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpbusd 268435456(%esp,%esi,8), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpbusd 291(%edi,%eax,4), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd (%eax), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x30] + {vex} vpdpbusd (%eax), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpbusd 4064(%ecx), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd -4096(%edx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpbusd -4096(%edx), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpbusd 268435456(%esp,%esi,8), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %xmm5, %xmm6 +// 
CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpbusd 291(%edi,%eax,4), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusd (%eax), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x30] + {vex} vpdpbusd (%eax), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusd -512(,%ebp,2), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpbusd -512(,%ebp,2), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusd 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpbusd 2032(%ecx), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusd -2048(%edx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpbusd -2048(%edx), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4] + {vex} vpdpbusds %ymm4, %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4] + {vex} vpdpbusds %xmm4, %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpbusds 268435456(%esp,%esi,8), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpbusds 291(%edi,%eax,4), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds (%eax), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x30] + {vex} vpdpbusds (%eax), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds -1024(,%ebp,2), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpbusds -1024(,%ebp,2), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpbusds 4064(%ecx), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds -4096(%edx), %ymm5, %ymm6 +// CHECK: encoding: 
[0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpbusds -4096(%edx), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpbusds 268435456(%esp,%esi,8), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpbusds 291(%edi,%eax,4), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds (%eax), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x30] + {vex} vpdpbusds (%eax), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds -512(,%ebp,2), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpbusds -512(,%ebp,2), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpbusds 2032(%ecx), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds -2048(%edx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpbusds -2048(%edx), %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4] + {vex} vpdpwssd %ymm4, %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4] + {vex} vpdpwssd %xmm4, %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssd 268435456(%esp,%esi,8), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpwssd 268435456(%esp,%esi,8), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssd 291(%edi,%eax,4), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpwssd 291(%edi,%eax,4), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssd (%eax), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x30] + {vex} vpdpwssd (%eax), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssd -1024(,%ebp,2), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpwssd 
-1024(,%ebp,2), %ymm5, %ymm6 + +// CHECK: vpdpwssd 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpwssd 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vpdpwssd -4096(%edx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpwssd -4096(%edx), %ymm5, %ymm6 + +// CHECK: vpdpwssd 268435456(%esp,%esi,8), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpwssd 268435456(%esp,%esi,8), %xmm5, %xmm6 + +// CHECK: vpdpwssd 291(%edi,%eax,4), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpwssd 291(%edi,%eax,4), %xmm5, %xmm6 + +// CHECK: vpdpwssd (%eax), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x30] + {vex} vpdpwssd (%eax), %xmm5, %xmm6 + +// CHECK: vpdpwssd -512(,%ebp,2), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpwssd -512(,%ebp,2), %xmm5, %xmm6 + +// CHECK: vpdpwssd 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpwssd 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vpdpwssd -2048(%edx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpwssd -2048(%edx), %xmm5, %xmm6 + +// CHECK: vpdpwssds %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4] + {vex} vpdpwssds %ymm4, %ymm5, %ymm6 + +// CHECK: vpdpwssds %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4] + {vex} vpdpwssds %xmm4, %xmm5, %xmm6 + +// CHECK: vpdpwssds 268435456(%esp,%esi,8), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpwssds 268435456(%esp,%esi,8), %ymm5, %ymm6 + +// CHECK: vpdpwssds 291(%edi,%eax,4), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpwssds 291(%edi,%eax,4), %ymm5, %ymm6 + +// CHECK: vpdpwssds (%eax), %ymm5, %ymm6 +// CHECK: 
encoding: [0xc4,0xe2,0x55,0x53,0x30] + {vex} vpdpwssds (%eax), %ymm5, %ymm6 + +// CHECK: vpdpwssds -1024(,%ebp,2), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpwssds -1024(,%ebp,2), %ymm5, %ymm6 + +// CHECK: vpdpwssds 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpwssds 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vpdpwssds -4096(%edx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpwssds -4096(%edx), %ymm5, %ymm6 + +// CHECK: vpdpwssds 268435456(%esp,%esi,8), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpwssds 268435456(%esp,%esi,8), %xmm5, %xmm6 + +// CHECK: vpdpwssds 291(%edi,%eax,4), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpwssds 291(%edi,%eax,4), %xmm5, %xmm6 + +// CHECK: vpdpwssds (%eax), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x30] + {vex} vpdpwssds (%eax), %xmm5, %xmm6 + +// CHECK: vpdpwssds -512(,%ebp,2), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpwssds -512(,%ebp,2), %xmm5, %xmm6 + +// CHECK: vpdpwssds 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpwssds 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vpdpwssds -2048(%edx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpwssds -2048(%edx), %xmm5, %xmm6 + diff --git a/llvm/test/MC/X86/intel-syntax-avx_vnni.s b/llvm/test/MC/X86/intel-syntax-avx_vnni.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/intel-syntax-avx_vnni.s @@ -0,0 +1,226 @@ +// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnni -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4] + {vex} 
vpdpbusd ymm6, ymm5, ymm4 + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4] + {vex} vpdpbusd xmm6, xmm5, xmm4 + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291] + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x30] + {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax] + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024] + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064] + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096] + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291] + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x30] + {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax] + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512] +// CHECK: encoding: 
[0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512] + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032] + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048] + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4] + {vex} vpdpbusds ymm6, ymm5, ymm4 + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4] + {vex} vpdpbusds xmm6, xmm5, xmm4 + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291] + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x30] + {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax] + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024] + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064] + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096] + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: 
[0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291] + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x30] + {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax] + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512] + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032] + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048] + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4] + {vex} vpdpwssd ymm6, ymm5, ymm4 + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4] + {vex} vpdpwssd xmm6, xmm5, xmm4 + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291] + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x30] + {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax] + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: 
[0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024] + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064] + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096] + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291] + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x30] + {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax] + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512] + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032] + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048] + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4] + {vex} vpdpwssds ymm6, ymm5, ymm4 + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4] + {vex} vpdpwssds xmm6, xmm5, xmm4 + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: 
[0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291] + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x30] + {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax] + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024] + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064] + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096] + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10] + {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00] + {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291] + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x30] + {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax] + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512] + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpwssds xmm6, 
xmm5, xmmword ptr [ecx + 2032] + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048] + diff --git a/llvm/test/MC/X86/intel-syntax-x86-64-avx_vnni.s b/llvm/test/MC/X86/intel-syntax-x86-64-avx_vnni.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/intel-syntax-x86-64-avx_vnni.s @@ -0,0 +1,226 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnni -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4] + {vex} vpdpbusd ymm6, ymm5, ymm4 + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4] + {vex} vpdpbusd xmm6, xmm5, xmm4 + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip] + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024] + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064] + +// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 
4096] + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip] + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512] + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032] + +// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048] + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4] + {vex} vpdpbusds ymm6, ymm5, ymm4 + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4] + {vex} vpdpbusds xmm6, xmm5, xmm4 + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpbusds 
ymm6, ymm5, ymmword ptr [rip] + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024] + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064] + +// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096] + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip] + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512] + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032] + +// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048] + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4] + {vex} vpdpwssd ymm6, ymm5, ymm4 + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4 +// CHECK: encoding: 
[0xc4,0xe2,0x51,0x52,0xf4] + {vex} vpdpwssd xmm6, xmm5, xmm4 + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip] + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024] + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064] + +// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096] + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip] + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpwssd xmm6, xmm5, xmmword 
ptr [2*rbp - 512] + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032] + +// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048] + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4] + {vex} vpdpwssds ymm6, ymm5, ymm4 + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4] + {vex} vpdpwssds xmm6, xmm5, xmm4 + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip] +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip] + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024] + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064] + +// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096] +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096] + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpwssds 
xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip] +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip] + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512] + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032] + +// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048] +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048] + diff --git a/llvm/test/MC/X86/x86-64-avx_vnni-encoding.s b/llvm/test/MC/X86/x86-64-avx_vnni-encoding.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/x86-64-avx_vnni-encoding.s @@ -0,0 +1,226 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnni --show-encoding < %s | FileCheck %s + +// CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4] + {vex} vpdpbusd %ymm4, %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4] + {vex} vpdpbusd %xmm4, %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpbusd 268435456(%rbp,%r14,8), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpbusd 291(%r8,%rax,4), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd (%rip), %ymm5, %ymm6 
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpbusd (%rip), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd -1024(,%rbp,2), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpbusd -1024(,%rbp,2), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd 4064(%rcx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpbusd 4064(%rcx), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd -4096(%rdx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpbusd -4096(%rdx), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpbusd 268435456(%rbp,%r14,8), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpbusd 291(%r8,%rax,4), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusd (%rip), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpbusd (%rip), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusd -512(,%rbp,2), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpbusd -512(,%rbp,2), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusd 2032(%rcx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpbusd 2032(%rcx), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusd -2048(%rdx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpbusd -2048(%rdx), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4] + {vex} vpdpbusds %ymm4, %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4] + {vex} vpdpbusds %xmm4, %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %ymm5, %ymm6 +// CHECK: encoding: 
[0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpbusds 268435456(%rbp,%r14,8), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpbusds 291(%r8,%rax,4), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds (%rip), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpbusds (%rip), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds -1024(,%rbp,2), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpbusds -1024(,%rbp,2), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds 4064(%rcx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpbusds 4064(%rcx), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds -4096(%rdx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpbusds -4096(%rdx), %ymm5, %ymm6 + +// CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpbusds 268435456(%rbp,%r14,8), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpbusds 291(%r8,%rax,4), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds (%rip), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpbusds (%rip), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds -512(,%rbp,2), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpbusds -512(,%rbp,2), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds 2032(%rcx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpbusds 2032(%rcx), %xmm5, %xmm6 + +// CHECK: {vex} vpdpbusds -2048(%rdx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpbusds -2048(%rdx), %xmm5, %xmm6 + +// 
CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4] + {vex} vpdpwssd %ymm4, %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4] + {vex} vpdpwssd %xmm4, %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpwssd 268435456(%rbp,%r14,8), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpwssd 291(%r8,%rax,4), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssd (%rip), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpwssd (%rip), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssd -1024(,%rbp,2), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpwssd -1024(,%rbp,2), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssd 4064(%rcx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpwssd 4064(%rcx), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssd -4096(%rdx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpwssd -4096(%rdx), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpwssd 268435456(%rbp,%r14,8), %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpwssd 291(%r8,%rax,4), %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssd (%rip), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpwssd (%rip), %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssd -512(,%rbp,2), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpwssd -512(,%rbp,2), %xmm5, %xmm6 + +// 
CHECK: {vex} vpdpwssd 2032(%rcx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpwssd 2032(%rcx), %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssd -2048(%rdx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpwssd -2048(%rdx), %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4] + {vex} vpdpwssds %ymm4, %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4] + {vex} vpdpwssds %xmm4, %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpwssds 268435456(%rbp,%r14,8), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpwssds 291(%r8,%rax,4), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssds (%rip), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpwssds (%rip), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssds -1024(,%rbp,2), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff] + {vex} vpdpwssds -1024(,%rbp,2), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssds 4064(%rcx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00] + {vex} vpdpwssds 4064(%rcx), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssds -4096(%rdx), %ymm5, %ymm6 +// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff] + {vex} vpdpwssds -4096(%rdx), %ymm5, %ymm6 + +// CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10] + {vex} vpdpwssds 268435456(%rbp,%r14,8), %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00] + {vex} vpdpwssds 291(%r8,%rax,4), 
%xmm5, %xmm6 + +// CHECK: {vex} vpdpwssds (%rip), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00] + {vex} vpdpwssds (%rip), %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssds -512(,%rbp,2), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff] + {vex} vpdpwssds -512(,%rbp,2), %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssds 2032(%rcx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00] + {vex} vpdpwssds 2032(%rcx), %xmm5, %xmm6 + +// CHECK: {vex} vpdpwssds -2048(%rdx), %xmm5, %xmm6 +// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff] + {vex} vpdpwssds -2048(%rdx), %xmm5, %xmm6 +