diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -432,13 +432,13 @@
 // GFNI
 TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v16qi, "V16cV16cV16cIc", "ncV:128:", "gfni")
 TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v32qi, "V32cV32cV32cIc", "ncV:256:", "avx,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v64qi, "V64cV64cV64cIc", "ncV:512:", "avx512bw,gfni")
+TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v64qi, "V64cV64cV64cIc", "ncV:512:", "avx512f,gfni")
 TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v16qi, "V16cV16cV16cIc", "ncV:128:", "gfni")
 TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v32qi, "V32cV32cV32cIc", "ncV:256:", "avx,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v64qi, "V64cV64cV64cIc", "ncV:512:", "avx512bw,gfni")
+TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v64qi, "V64cV64cV64cIc", "ncV:512:", "avx512f,gfni")
 TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v16qi, "V16cV16cV16c", "ncV:128:", "gfni")
 TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v32qi, "V32cV32cV32c", "ncV:256:", "avx,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v64qi, "V64cV64cV64c", "ncV:512:", "avx512bw,gfni")
+TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v64qi, "V64cV64cV64c", "ncV:512:", "avx512f,gfni")
 
 // CLMUL
 TARGET_BUILTIN(__builtin_ia32_pclmulqdq128, "V2OiV2OiV2OiIc", "ncV:128:", "pclmul")
diff --git a/clang/lib/Headers/gfniintrin.h b/clang/lib/Headers/gfniintrin.h
--- a/clang/lib/Headers/gfniintrin.h
+++ b/clang/lib/Headers/gfniintrin.h
@@ -20,10 +20,12 @@
 /* Default attributes for YMM unmasked form. */
 #define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
 
-/* Default attributes for ZMM forms. */
-#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
+/* Default attributes for ZMM unmasked forms. */
+#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512f,gfni"), __min_vector_width__(512)))
+/* Default attributes for ZMM masked forms. */
+#define __DEFAULT_FN_ATTRS_Z_MASK __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
 
-/* Default attributes for VLX forms. */
+/* Default attributes for VLX masked forms. */
 #define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
 
@@ -99,7 +101,7 @@
                                                   (__v64qi) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
 _mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
 {
   return (__m512i) __builtin_ia32_selectb_512(__U,
@@ -107,7 +109,7 @@
             (__v64qi) __S);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
 _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
 {
   return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
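
[Reviewer note, not part of the patch] The two files above carry the actual relaxation: the unmasked 512-bit GFNI builtins and intrinsics now only require avx512f+gfni, while the masked forms keep avx512bw through the new __DEFAULT_FN_ATTRS_Z_MASK. A minimal usage sketch, assuming a compiler with this patch applied (the function name and driver flags below are illustrative only):

    /* build sketch: clang -O2 -mavx512f -mgfni demo.c */
    #include <immintrin.h>

    /* Unmasked ZMM GF(2^8) multiply: with this patch it no longer
       pulls in the AVX512BW feature. */
    __m512i gf_mul_bytes(__m512i a, __m512i b) {
      return _mm512_gf2p8mul_epi8(a, b);
    }
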
diff --git a/clang/test/CodeGen/X86/gfni-builtins.c b/clang/test/CodeGen/X86/gfni-builtins.c
--- a/clang/test/CodeGen/X86/gfni-builtins.c
+++ b/clang/test/CodeGen/X86/gfni-builtins.c
@@ -1,6 +1,7 @@
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -emit-llvm -o - | FileCheck %s --check-prefix SSE
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes SSE,AVX
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - | FileCheck %s --check-prefixes SSE,AVX,AVX512
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -target-feature +avx512f -target-feature +avx512vl -emit-llvm -o - | FileCheck %s --check-prefixes SSE,AVX,AVX512
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - | FileCheck %s --check-prefixes SSE,AVX,AVX512,AVX512BW
 
 #include <immintrin.h>
 
@@ -42,141 +43,150 @@
 }
 #endif // __AVX__
 
-#ifdef __AVX512BW__
+#ifdef __AVX512F__
 __m512i test_mm512_gf2p8affineinv_epi64_epi8(__m512i A, __m512i B) {
   // AVX512-LABEL: @test_mm512_gf2p8affineinv_epi64_epi8
   // AVX512: @llvm.x86.vgf2p8affineinvqb.512
   return _mm512_gf2p8affineinv_epi64_epi8(A, B, 1);
 }
 
-__m512i test_mm512_mask_gf2p8affineinv_epi64_epi8(__m512i S, __mmask64 U, __m512i A, __m512i B) {
-  // AVX512-LABEL: @test_mm512_mask_gf2p8affineinv_epi64_epi8
-  // AVX512: @llvm.x86.vgf2p8affineinvqb.512
-  // AVX512: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{[0-9]+}}, <64 x i8> {{.*}}
-  return _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, 1);
+__m512i test_mm512_gf2p8affine_epi64_epi8(__m512i A, __m512i B) {
+  // AVX512-LABEL: @test_mm512_gf2p8affine_epi64_epi8
+  // AVX512: @llvm.x86.vgf2p8affineqb.512
+  return _mm512_gf2p8affine_epi64_epi8(A, B, 1);
 }
 
-__m512i test_mm512_maskz_gf2p8affineinv_epi64_epi8(__mmask64 U, __m512i A, __m512i B) {
-  // AVX512-LABEL: @test_mm512_maskz_gf2p8affineinv_epi64_epi8
-  // AVX512: @llvm.x86.vgf2p8affineinvqb.512
-  // AVX512: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{[0-9]+}}, <64 x i8> {{.*}}
-  return _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, 1);
+__m512i test_mm512_gf2p8mul_epi8(__m512i A, __m512i B) {
+  // AVX512-LABEL: @test_mm512_gf2p8mul_epi8
+  // AVX512: @llvm.x86.vgf2p8mulb.512
+  return _mm512_gf2p8mul_epi8(A, B);
 }
+#endif // __AVX512F__
 
-__m256i test_mm256_mask_gf2p8affineinv_epi64_epi8(__m256i S, __mmask32 U, __m256i A, __m256i B) {
-  // AVX256-LABEL: @test_mm256_mask_gf2p8affineinv_epi64_epi8
-  // AVX256: @llvm.x86.vgf2p8affineinvqb.256
-  // AVX256: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{[0-9]+}}, <32 x i8> {{.*}}
-  return _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, 1);
+#ifdef __AVX512BW__
+__m512i test_mm512_mask_gf2p8affineinv_epi64_epi8(__m512i S, __mmask64 U, __m512i A, __m512i B) {
+  // AVX512BW-LABEL: @test_mm512_mask_gf2p8affineinv_epi64_epi8
+  // AVX512BW: @llvm.x86.vgf2p8affineinvqb.512
+  // AVX512BW: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{[0-9]+}}, <64 x i8> {{.*}}
+  return _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, 1);
 }
 
-__m256i test_mm256_maskz_gf2p8affineinv_epi64_epi8(__mmask32 U, __m256i A, __m256i B) {
-  // AVX256-LABEL: @test_mm256_maskz_gf2p8affineinv_epi64_epi8
-  // AVX256: @llvm.x86.vgf2p8affineinvqb.256
-  // AVX256: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{[0-9]+}}, <32 x i8> {{.*}}
-  return _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, 1);
+__m512i test_mm512_maskz_gf2p8affineinv_epi64_epi8(__mmask64 U, __m512i A, __m512i B) {
+  // AVX512BW-LABEL: @test_mm512_maskz_gf2p8affineinv_epi64_epi8
+  // AVX512BW: @llvm.x86.vgf2p8affineinvqb.512
+  // AVX512BW: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{[0-9]+}}, <64 x i8> {{.*}}
+  return _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, 1);
 }
 
 __m128i test_mm_mask_gf2p8affineinv_epi64_epi8(__m128i S, __mmask16 U, __m128i A, __m128i B) {
-  // AVX512-LABEL: @test_mm_mask_gf2p8affineinv_epi64_epi8
-  // AVX512: @llvm.x86.vgf2p8affineinvqb.128
-  // AVX512: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
+  // AVX512BW-LABEL: @test_mm_mask_gf2p8affineinv_epi64_epi8
+  // AVX512BW: @llvm.x86.vgf2p8affineinvqb.128
+  // AVX512BW: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
   return _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, 1);
 }
 
 __m128i test_mm_maskz_gf2p8affineinv_epi64_epi8(__mmask16 U, __m128i A, __m128i B) {
-  // AVX512-LABEL: @test_mm_maskz_gf2p8affineinv_epi64_epi8
-  // AVX512: @llvm.x86.vgf2p8affineinvqb.128
-  // AVX512: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
+  // AVX512BW-LABEL: @test_mm_maskz_gf2p8affineinv_epi64_epi8
+  // AVX512BW: @llvm.x86.vgf2p8affineinvqb.128
+  // AVX512BW: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
   return _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, 1);
 }
 
-__m512i test_mm512_gf2p8affine_epi64_epi8(__m512i A, __m512i B) {
-  // AVX512-LABEL: @test_mm512_gf2p8affine_epi64_epi8
-  // AVX512: @llvm.x86.vgf2p8affineqb.512
-  return _mm512_gf2p8affine_epi64_epi8(A, B, 1);
+__m256i test_mm256_mask_gf2p8affineinv_epi64_epi8(__m256i S, __mmask32 U, __m256i A, __m256i B) {
+  // AVX512BW-LABEL: @test_mm256_mask_gf2p8affineinv_epi64_epi8
+  // AVX512BW: @llvm.x86.vgf2p8affineinvqb.256
+  // AVX512BW: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{[0-9]+}}, <32 x i8> {{.*}}
+  return _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, 1);
+}
+
+__m256i test_mm256_maskz_gf2p8affineinv_epi64_epi8(__mmask32 U, __m256i A, __m256i B) {
+  // AVX512BW-LABEL: @test_mm256_maskz_gf2p8affineinv_epi64_epi8
+  // AVX512BW: @llvm.x86.vgf2p8affineinvqb.256
+  // AVX512BW: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{[0-9]+}}, <32 x i8> {{.*}}
+  return _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, 1);
 }
 
 __m512i test_mm512_mask_gf2p8affine_epi64_epi8(__m512i S, __mmask64 U, __m512i A, __m512i B) {
-  // AVX512-LABEL: @test_mm512_mask_gf2p8affine_epi64_epi8
-  // AVX512: @llvm.x86.vgf2p8affineqb.512
-  // AVX512: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{[0-9]+}}, <64 x i8> {{.*}}
+  // AVX512BW-LABEL: @test_mm512_mask_gf2p8affine_epi64_epi8
+  // AVX512BW: @llvm.x86.vgf2p8affineqb.512
+  // AVX512BW: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{[0-9]+}}, <64 x i8> {{.*}}
   return _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, 1);
 }
 
 __m512i test_mm512_maskz_gf2p8affine_epi64_epi8(__mmask64 U, __m512i A, __m512i B) {
-  // AVX512-LABEL: @test_mm512_maskz_gf2p8affine_epi64_epi8
-  // AVX512: @llvm.x86.vgf2p8affineqb.512
-  // AVX512: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{[0-9]+}}, <64 x i8> {{.*}}
+  // AVX512BW-LABEL: @test_mm512_maskz_gf2p8affine_epi64_epi8
+  // AVX512BW: @llvm.x86.vgf2p8affineqb.512
+  // AVX512BW: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{[0-9]+}}, <64 x i8> {{.*}}
   return _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, 1);
 }
 
-__m256i test_mm256_mask_gf2p8affine_epi64_epi8(__m256i S, __mmask32 U, __m256i A, __m256i B) {
-  // AVX256-LABEL: @test_mm256_mask_gf2p8affine_epi64_epi8
-  // AVX256: @llvm.x86.vgf2p8affineqb.256
-  // AVX256: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{[0-9]+}}, <32 x i8> {{.*}}
-  return _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, 1);
-}
-
-__m256i test_mm256_maskz_gf2p8affine_epi64_epi8(__mmask32 U, __m256i A, __m256i B) {
-  // AVX256-LABEL: @test_mm256_maskz_gf2p8affine_epi64_epi8
-  // AVX256: @llvm.x86.vgf2p8affineqb.256
-  // AVX256: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{[0-9]+}}, <32 x i8> {{.*}}
-  return _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, 1);
-}
-
 __m128i test_mm_mask_gf2p8affine_epi64_epi8(__m128i S, __mmask16 U, __m128i A, __m128i B) {
-  // AVX512-LABEL: @test_mm_mask_gf2p8affine_epi64_epi8
-  // AVX512: @llvm.x86.vgf2p8affineqb.128
-  // AVX512: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
+  // AVX512BW-LABEL: @test_mm_mask_gf2p8affine_epi64_epi8
+  // AVX512BW: @llvm.x86.vgf2p8affineqb.128
+  // AVX512BW: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
   return _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, 1);
 }
 
 __m128i test_mm_maskz_gf2p8affine_epi64_epi8(__mmask16 U, __m128i A, __m128i B) {
-  // AVX512-LABEL: @test_mm_maskz_gf2p8affine_epi64_epi8
-  // AVX512: @llvm.x86.vgf2p8affineqb.128
-  // AVX512: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
+  // AVX512BW-LABEL: @test_mm_maskz_gf2p8affine_epi64_epi8
+  // AVX512BW: @llvm.x86.vgf2p8affineqb.128
+  // AVX512BW: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
   return _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, 1);
 }
 
-__m512i test_mm512_gf2p8mul_epi8(__m512i A, __m512i B) {
-  // AVX512-LABEL: @test_mm512_gf2p8mul_epi8
-  // AVX512: @llvm.x86.vgf2p8mulb.512
-  return _mm512_gf2p8mul_epi8(A, B);
+__m256i test_mm256_mask_gf2p8affine_epi64_epi8(__m256i S, __mmask32 U, __m256i A, __m256i B) {
+  // AVX512BW-LABEL: @test_mm256_mask_gf2p8affine_epi64_epi8
+  // AVX512BW: @llvm.x86.vgf2p8affineqb.256
+  // AVX512BW: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{[0-9]+}}, <32 x i8> {{.*}}
+  return _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, 1);
+}
+
+__m256i test_mm256_maskz_gf2p8affine_epi64_epi8(__mmask32 U, __m256i A, __m256i B) {
+  // AVX512BW-LABEL: @test_mm256_maskz_gf2p8affine_epi64_epi8
+  // AVX512BW: @llvm.x86.vgf2p8affineqb.256
+  // AVX512BW: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{[0-9]+}}, <32 x i8> {{.*}}
+  return _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, 1);
 }
 
 __m512i test_mm512_mask_gf2p8mul_epi8(__m512i S, __mmask64 U, __m512i A, __m512i B) {
-  // AVX512-LABEL: @test_mm512_mask_gf2p8mul_epi8
-  // AVX512: @llvm.x86.vgf2p8mulb.512
-  // AVX512: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{[0-9]+}}, <64 x i8> {{.*}}
+  // AVX512BW-LABEL: @test_mm512_mask_gf2p8mul_epi8
+  // AVX512BW: @llvm.x86.vgf2p8mulb.512
+  // AVX512BW: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{[0-9]+}}, <64 x i8> {{.*}}
   return _mm512_mask_gf2p8mul_epi8(S, U, A, B);
 }
 
 __m512i test_mm512_maskz_gf2p8mul_epi8(__mmask64 U, __m512i A, __m512i B) {
-  // AVX512-LABEL: @test_mm512_maskz_gf2p8mul_epi8
-  // AVX512: @llvm.x86.vgf2p8mulb.512
-  // AVX512: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{[0-9]+}}, <64 x i8> {{.*}}
+  // AVX512BW-LABEL: @test_mm512_maskz_gf2p8mul_epi8
+  // AVX512BW: @llvm.x86.vgf2p8mulb.512
+  // AVX512BW: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{[0-9]+}}, <64 x i8> {{.*}}
   return _mm512_maskz_gf2p8mul_epi8(U, A, B);
 }
 
+__m128i test_mm_mask_gf2p8mul_epi8(__m128i S, __mmask16 U, __m128i A, __m128i B) {
+  // AVX512BW-LABEL: @test_mm_mask_gf2p8mul_epi8
+  // AVX512BW: @llvm.x86.vgf2p8mulb.128
+  // AVX512BW: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
+  return _mm_mask_gf2p8mul_epi8(S, U, A, B);
+}
+
+__m128i test_mm_maskz_gf2p8mul_epi8(__mmask16 U, __m128i A, __m128i B) {
+  // AVX512BW-LABEL: @test_mm_maskz_gf2p8mul_epi8
+  // AVX512BW: @llvm.x86.vgf2p8mulb.128
+  // AVX512BW: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
+  return _mm_maskz_gf2p8mul_epi8(U, A, B);
+}
+
 __m256i test_mm256_mask_gf2p8mul_epi8(__m256i S, __mmask32 U, __m256i A, __m256i B) {
-  // AVX256-LABEL: @test_mm256_mask_gf2p8mul_epi8
-  // AVX256: @llvm.x86.vgf2p8mulb.256
-  // AVX256: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{[0-9]+}}, <32 x i8> {{.*}}
+  // AVX512BW-LABEL: @test_mm256_mask_gf2p8mul_epi8
+  // AVX512BW: @llvm.x86.vgf2p8mulb.256
+  // AVX512BW: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{[0-9]+}}, <32 x i8> {{.*}}
  return _mm256_mask_gf2p8mul_epi8(S, U, A, B);
 }
 
 __m256i test_mm256_maskz_gf2p8mul_epi8(__mmask32 U, __m256i A, __m256i B) {
-  // AVX256-LABEL: @test_mm256_maskz_gf2p8mul_epi8
-  // AVX256: @llvm.x86.vgf2p8mulb.256
-  // AVX256: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{[0-9]+}}, <32 x i8> {{.*}}
+  // AVX512BW-LABEL: @test_mm256_maskz_gf2p8mul_epi8
+  // AVX512BW: @llvm.x86.vgf2p8mulb.256
+  // AVX512BW: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{[0-9]+}}, <32 x i8> {{.*}}
   return _mm256_maskz_gf2p8mul_epi8(U, A, B);
 }
-
-__m128i test_mm_mask_gf2p8mul_epi8(__m128i S, __mmask16 U, __m128i A, __m128i B) {
-  // AVX512-LABEL: @test_mm_mask_gf2p8mul_epi8
-  // AVX512: @llvm.x86.vgf2p8mulb.128
-  // AVX512: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
-  return _mm_mask_gf2p8mul_epi8(S, U, A, B);
-}
 #endif // __AVX512BW__
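
[Reviewer note, not part of the patch] The test reshuffle above mirrors the header split: unmasked 512-bit tests move under __AVX512F__ with the plain AVX512 prefix, while all masked variants stay under __AVX512BW__ with the new AVX512BW prefix, since byte-granularity masking (the wide __mmask32/__mmask64 k-registers and the i8-lane select) still needs BWI. A masked-use sketch under that assumption (function name and flags are illustrative):

    /* build sketch: clang -O2 -mavx512bw -mgfni demo.c */
    #include <immintrin.h>

    /* Masked ZMM form: the 64-bit byte mask requires AVX512BW, which is
       why the masked intrinsics keep the avx512bw target attribute. */
    __m512i gf_mul_bytes_masked(__m512i src, __mmask64 m,
                                __m512i a, __m512i b) {
      return _mm512_mask_gf2p8mul_epi8(src, m, a, b);
    }
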
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -12694,10 +12694,10 @@
 multiclass GF2P8MULB_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
                                    X86SchedWriteWidths sched> {
-  let Predicates = [HasGFNI, HasAVX512, HasBWI] in
+  let Predicates = [HasGFNI, HasAVX512] in
   defm Z : avx512_binop_rm<Op, OpStr, OpNode, v64i8_info, sched.ZMM, 1>,
            EVEX_V512;
-  let Predicates = [HasGFNI, HasVLX, HasBWI] in {
+  let Predicates = [HasGFNI, HasVLX] in {
   defm Z256 : avx512_binop_rm<Op, OpStr, OpNode, v32i8x_info, sched.YMM, 1>,
               EVEX_V256;
   defm Z128 : avx512_binop_rm<Op, OpStr, OpNode, v16i8x_info, sched.XMM, 1>,
@@ -12726,10 +12726,10 @@
 multiclass GF2P8AFFINE_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
                                      X86SchedWriteWidths sched> {
-  let Predicates = [HasGFNI, HasAVX512, HasBWI] in
+  let Predicates = [HasGFNI, HasAVX512] in
   defm Z : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.ZMM,
                                       v64i8_info>, EVEX_V512;
-  let Predicates = [HasGFNI, HasVLX, HasBWI] in {
+  let Predicates = [HasGFNI, HasVLX] in {
   defm Z256 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.YMM,
                                          v32i8x_info>, EVEX_V256;
   defm Z128 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.XMM,
                                          v16i8x_info>, EVEX_V128;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
-  let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
+  let Predicates = [HasGFNI, HasAVX, NoVLX] in {
   defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, load,
                                 i128mem>, VEX_4V, VEX_W;
   defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, load,
                                   i256mem>, VEX_L, VEX_4V, VEX_W;
 
-let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
+let Predicates = [HasGFNI, HasAVX, NoVLX] in {
   defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                  i128mem>, VEX_4V;
   defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
diff --git a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
--- a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
@@ -1,28 +1,58 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,+gfni,+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+gfni,+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,+gfni,+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+gfni,+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64BW
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,+gfni --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86NOBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+gfni --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64NOBW
 
 declare <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8>, <16 x i8>, i8)
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) {
-; X86-LABEL: test_vgf2p8affineinvqb_128:
-; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
-; X86-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
-; X86-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
-; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
-; X86-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_vgf2p8affineinvqb_128:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
-; X64-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
-; X64-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
-; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
-; X64-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
-; X64-NEXT: retq # encoding: [0xc3]
+; X86BW-LABEL: test_vgf2p8affineinvqb_128:
+; X86BW: # %bb.0:
+; X86BW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86BW-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
+; X86BW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
+; X86BW-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
+; X86BW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86BW-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
+; X86BW-NEXT: retl # encoding: [0xc3]
+;
+; X64BW-LABEL: test_vgf2p8affineinvqb_128:
+; X64BW: # %bb.0:
+; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64BW-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
+; X64BW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
+; X64BW-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
+; X64BW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64BW-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
+; X64BW-NEXT: retq # encoding: [0xc3]
+;
+; X86NOBW-LABEL: test_vgf2p8affineinvqb_128:
+; X86NOBW: # %bb.0:
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86NOBW-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
+; X86NOBW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x04]
+; X86NOBW-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x05]
+; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd]
+; X86NOBW-NEXT: vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0xdb,0xcc]
+; X86NOBW-NEXT: vpblendvb %xmm5, %xmm0, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x4c,0xd0,0x50]
+; X86NOBW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86NOBW-NEXT: retl # encoding: [0xc3]
+;
+; X64NOBW-LABEL: test_vgf2p8affineinvqb_128:
+; X64NOBW: # %bb.0:
+; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64NOBW-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
+; X64NOBW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x04]
+; X64NOBW-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x05]
+; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd]
+; X64NOBW-NEXT: vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0xdb,0xcc]
+; X64NOBW-NEXT: vpblendvb %xmm5, %xmm0, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x4c,0xd0,0x50]
+; X64NOBW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64NOBW-NEXT: retq # encoding: [0xc3]
 %1 = bitcast i16 %mask to <16 x i1>
 %2 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3)
 %3 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 4)
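
[Reviewer note, not part of the patch] The new NOBW check blocks above show the masking fallback: without AVX512BW there is no byte-masked {%k1} form, so the backend materializes the k-register as an all-ones byte vector (vpternlogd under {k}{z}, then vpmovdb) and blends with vpblendvb/vpand. The same idea expressed in intrinsics, as a sketch (helper name is hypothetical):

    #include <immintrin.h>

    /* Byte-granularity select without AVX512BW, mirroring the codegen:
       all-ones dwords under the mask, narrowed to bytes, then a blend. */
    __m128i select_bytes_no_bw(__mmask16 m, __m128i a, __m128i b) {
      __m128i bytemask = _mm512_cvtepi32_epi8(_mm512_maskz_set1_epi32(m, -1));
      return _mm_blendv_epi8(b, a, bytemask);  /* mask set -> a, else b */
    }
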
@@ -37,25 +67,60 @@
 declare <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8>, <32 x i8>, i8)
 define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) {
-; X86-LABEL: test_vgf2p8affineinvqb_256:
-; X86: # %bb.0:
-; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
-; X86-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
-; X86-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
-; X86-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
-; X86-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_vgf2p8affineinvqb_256:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
-; X64-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
-; X64-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
-; X64-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
-; X64-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
-; X64-NEXT: retq # encoding: [0xc3]
+; X86BW-LABEL: test_vgf2p8affineinvqb_256:
+; X86BW: # %bb.0:
+; X86BW-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86BW-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
+; X86BW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
+; X86BW-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
+; X86BW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X86BW-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
+; X86BW-NEXT: retl # encoding: [0xc3]
+;
+; X64BW-LABEL: test_vgf2p8affineinvqb_256:
+; X64BW: # %bb.0:
+; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64BW-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
+; X64BW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
+; X64BW-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
+; X64BW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X64BW-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
+; X64BW-NEXT: retq # encoding: [0xc3]
+;
+; X86NOBW-LABEL: test_vgf2p8affineinvqb_256:
+; X86NOBW: # %bb.0:
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06]
+; X86NOBW-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
+; X86NOBW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x04]
+; X86NOBW-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x05]
+; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
+; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01]
+; X86NOBW-NEXT: vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc]
+; X86NOBW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm2 # encoding: [0xc4,0xe3,0x6d,0x4c,0xd0,0x50]
+; X86NOBW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X86NOBW-NEXT: retl # encoding: [0xc3]
+;
+; X64NOBW-LABEL: test_vgf2p8affineinvqb_256:
+; X64NOBW: # %bb.0:
+; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64NOBW-NEXT: shrl $16, %edi # encoding: [0xc1,0xef,0x10]
+; X64NOBW-NEXT: kmovw %edi, %k2 # encoding: [0xc5,0xf8,0x92,0xd7]
+; X64NOBW-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
+; X64NOBW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x04]
+; X64NOBW-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x05]
+; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
+; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X64NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01]
+; X64NOBW-NEXT: vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc]
+; X64NOBW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm2 # encoding: [0xc4,0xe3,0x6d,0x4c,0xd0,0x50]
+; X64NOBW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X64NOBW-NEXT: retq # encoding: [0xc3]
 %1 = bitcast i32 %mask to <32 x i1>
 %2 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3)
 %3 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 4)
@@ -70,25 +135,80 @@
 declare <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8>, <64 x i8>, i8)
 define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) {
-; X86-LABEL: test_vgf2p8affineinvqb_512:
-; X86: # %bb.0:
-; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
-; X86-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
-; X86-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
-; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
-; X86-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_vgf2p8affineinvqb_512:
-; X64: # %bb.0:
-; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
-; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
-; X64-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
-; X64-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
-; X64-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
-; X64-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
-; X64-NEXT: retq # encoding: [0xc3]
+; X86BW-LABEL: test_vgf2p8affineinvqb_512:
+; X86BW: # %bb.0:
+; X86BW-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86BW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
+; X86BW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
+; X86BW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
+; X86BW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X86BW-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
+; X86BW-NEXT: retl # encoding: [0xc3]
+;
+; X64BW-LABEL: test_vgf2p8affineinvqb_512:
+; X64BW: # %bb.0:
+; X64BW-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64BW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
+; X64BW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
+; X64BW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
+; X64BW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X64BW-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
+; X64BW-NEXT: retq # encoding: [0xc3]
+;
+; X86NOBW-LABEL: test_vgf2p8affineinvqb_512:
+; X86NOBW: # %bb.0:
+; X86NOBW-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x04]
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x06]
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k3 # encoding: [0xc5,0xf8,0x90,0x5c,0x24,0x0a]
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08]
+; X86NOBW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x03]
+; X86NOBW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm3, %zmm4 # encoding: [0x62,0xf3,0xe5,0x48,0xcf,0xe1,0x04]
+; X86NOBW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm3, %zmm3 # encoding: [0x62,0xf3,0xe5,0x48,0xcf,0xd9,0x05]
+; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
+; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01]
+; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X86NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} # encoding: [0x62,0xf3,0x4d,0xc9,0x25,0xf6,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6]
+; X86NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01]
+; X86NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01]
+; X86NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc]
+; X86NOBW-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd3,0xb8]
+; X86NOBW-NEXT: retl # encoding: [0xc3]
+;
+; X64NOBW-LABEL: test_vgf2p8affineinvqb_512:
+; X64NOBW: # %bb.0:
+; X64NOBW-NEXT: movq %rdi, %rax # encoding: [0x48,0x89,0xf8]
+; X64NOBW-NEXT: movl %edi, %ecx # encoding: [0x89,0xf9]
+; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64NOBW-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
+; X64NOBW-NEXT: shrq $48, %rax # encoding: [0x48,0xc1,0xe8,0x30]
+; X64NOBW-NEXT: shrl $16, %ecx # encoding: [0xc1,0xe9,0x10]
+; X64NOBW-NEXT: kmovw %ecx, %k2 # encoding: [0xc5,0xf8,0x92,0xd1]
+; X64NOBW-NEXT: kmovw %eax, %k3 # encoding: [0xc5,0xf8,0x92,0xd8]
+; X64NOBW-NEXT: kmovw %edi, %k4 # encoding: [0xc5,0xf8,0x92,0xe7]
+; X64NOBW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
+; X64NOBW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xe1,0x04]
+; X64NOBW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x05]
+; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
+; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X64NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01]
+; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} # encoding: [0x62,0xf3,0x55,0xc9,0x25,0xed,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X64NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} # encoding: [0x62,0xf3,0x4d,0xca,0x25,0xf6,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6]
+; X64NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01]
+; X64NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01]
+; X64NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc]
+; X64NOBW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd0,0xb8]
+; X64NOBW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X64NOBW-NEXT: retq # encoding: [0xc3]
 %1 = bitcast i64 %mask to <64 x i1>
 %2 = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3)
 %3 = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 4)
@@ -103,25 +223,53 @@
 declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) {
-; X86-LABEL: test_vgf2p8affineqb_128:
-; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
-; X86-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
-; X86-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
-; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
-; X86-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_vgf2p8affineqb_128:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
-; X64-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
-; X64-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
-; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
-; X64-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
-; X64-NEXT: retq # encoding: [0xc3]
+; X86BW-LABEL: test_vgf2p8affineqb_128:
+; X86BW: # %bb.0:
+; X86BW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86BW-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
+; X86BW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
+; X86BW-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
+; X86BW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86BW-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
+; X86BW-NEXT: retl # encoding: [0xc3]
+;
+; X64BW-LABEL: test_vgf2p8affineqb_128:
+; X64BW: # %bb.0:
+; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64BW-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
+; X64BW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
+; X64BW-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
+; X64BW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64BW-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
+; X64BW-NEXT: retq # encoding: [0xc3]
+;
+; X86NOBW-LABEL: test_vgf2p8affineqb_128:
+; X86NOBW: # %bb.0:
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86NOBW-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
+; X86NOBW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x04]
+; X86NOBW-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x05]
+; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd]
+; X86NOBW-NEXT: vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0xdb,0xcc]
+; X86NOBW-NEXT: vpblendvb %xmm5, %xmm0, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x4c,0xd0,0x50]
+; X86NOBW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86NOBW-NEXT: retl # encoding: [0xc3]
+;
+; X64NOBW-LABEL: test_vgf2p8affineqb_128:
+; X64NOBW: # %bb.0:
+; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64NOBW-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
+; X64NOBW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x04]
+; X64NOBW-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x05]
+; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd]
+; X64NOBW-NEXT: vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0xdb,0xcc]
+; X64NOBW-NEXT: vpblendvb %xmm5, %xmm0, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x4c,0xd0,0x50]
+; X64NOBW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64NOBW-NEXT: retq # encoding: [0xc3]
 %1 = bitcast i16 %mask to <16 x i1>
 %2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3)
 %3 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 4)
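
[Reviewer note, not part of the patch] In these tests the $3/$4/$5 immediates are just distinct values of b in the affine transform A*x + b over GF(2). For context, a classic real use of the unmasked form, which after this patch only needs avx512f+gfni (function name is illustrative):

    #include <immintrin.h>

    /* Reverse the bits of every byte: the well-known gf2p8affine trick
       with the bit-reversal matrix and an all-zero additive constant. */
    __m512i bitrev_bytes(__m512i x) {
      const __m512i M = _mm512_set1_epi64((long long)0x8040201008040201ULL);
      return _mm512_gf2p8affine_epi64_epi8(x, M, 0);
    }
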
@@ -136,25 +284,60 @@
 declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
 define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) {
-; X86-LABEL: test_vgf2p8affineqb_256:
-; X86: # %bb.0:
-; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
-; X86-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
-; X86-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
-; X86-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
-; X86-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_vgf2p8affineqb_256:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
-; X64-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
-; X64-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
-; X64-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
-; X64-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
-; X64-NEXT: retq # encoding: [0xc3]
+; X86BW-LABEL: test_vgf2p8affineqb_256:
+; X86BW: # %bb.0:
+; X86BW-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86BW-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
+; X86BW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
+; X86BW-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
+; X86BW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X86BW-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
+; X86BW-NEXT: retl # encoding: [0xc3]
+;
+; X64BW-LABEL: test_vgf2p8affineqb_256:
+; X64BW: # %bb.0:
+; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64BW-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
+; X64BW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
+; X64BW-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
+; X64BW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X64BW-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
+; X64BW-NEXT: retq # encoding: [0xc3]
+;
+; X86NOBW-LABEL: test_vgf2p8affineqb_256:
+; X86NOBW: # %bb.0:
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06]
+; X86NOBW-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
+; X86NOBW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x04]
+; X86NOBW-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x05]
+; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
+; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01]
+; X86NOBW-NEXT: vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc]
+; X86NOBW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm2 # encoding: [0xc4,0xe3,0x6d,0x4c,0xd0,0x50]
+; X86NOBW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X86NOBW-NEXT: retl # encoding: [0xc3]
+;
+; X64NOBW-LABEL: test_vgf2p8affineqb_256:
+; X64NOBW: # %bb.0:
+; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64NOBW-NEXT: shrl $16, %edi # encoding: [0xc1,0xef,0x10]
+; X64NOBW-NEXT: kmovw %edi, %k2 # encoding: [0xc5,0xf8,0x92,0xd7]
+; X64NOBW-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
+; X64NOBW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x04]
+; X64NOBW-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x05]
+; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
+; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X64NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01]
+; X64NOBW-NEXT: vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc]
+; X64NOBW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm2 # encoding: [0xc4,0xe3,0x6d,0x4c,0xd0,0x50]
+; X64NOBW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X64NOBW-NEXT: retq # encoding: [0xc3]
 %1 = bitcast i32 %mask to <32 x i1>
 %2 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3)
 %3 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 4)
@@ -169,25 +352,80 @@
 declare <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
 define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) {
-; X86-LABEL: test_vgf2p8affineqb_512:
-; X86: # %bb.0:
-; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
-; X86-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
-; X86-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
-; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
-; X86-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_vgf2p8affineqb_512:
-; X64: # %bb.0:
-; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
-; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
-; X64-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
-; X64-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
-; X64-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
-; X64-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
-; X64-NEXT: retq # encoding: [0xc3]
+; X86BW-LABEL: test_vgf2p8affineqb_512:
+; X86BW: # %bb.0:
+; X86BW-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86BW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
+; X86BW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
+; X86BW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
+; X86BW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X86BW-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
+; X86BW-NEXT: retl # encoding: [0xc3]
+;
+; X64BW-LABEL: test_vgf2p8affineqb_512:
+; X64BW: # %bb.0:
+; X64BW-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64BW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
+; X64BW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
+; X64BW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
+; X64BW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X64BW-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
+; X64BW-NEXT: retq # encoding: [0xc3]
+;
+; X86NOBW-LABEL: test_vgf2p8affineqb_512:
+; X86NOBW: # %bb.0:
+; X86NOBW-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x04]
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x06]
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k3 # encoding: [0xc5,0xf8,0x90,0x5c,0x24,0x0a]
+; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08]
+; X86NOBW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x03]
+; X86NOBW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm3, %zmm4 # encoding: [0x62,0xf3,0xe5,0x48,0xce,0xe1,0x04]
+; X86NOBW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm3, %zmm3 # encoding: [0x62,0xf3,0xe5,0x48,0xce,0xd9,0x05]
+; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
+; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01]
+; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X86NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} # encoding: [0x62,0xf3,0x4d,0xc9,0x25,0xf6,0xff]
+; X86NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6]
+; X86NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01]
+; X86NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01]
+; X86NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc]
+; X86NOBW-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd3,0xb8]
+; X86NOBW-NEXT: retl # encoding: [0xc3]
+;
+; X64NOBW-LABEL: test_vgf2p8affineqb_512:
+; X64NOBW: # %bb.0:
+; X64NOBW-NEXT: movq %rdi, %rax # encoding: [0x48,0x89,0xf8]
+; X64NOBW-NEXT: movl %edi, %ecx # encoding: [0x89,0xf9]
+; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64NOBW-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
+; X64NOBW-NEXT: shrq $48, %rax # encoding: [0x48,0xc1,0xe8,0x30]
+; X64NOBW-NEXT: shrl $16, %ecx # encoding: [0xc1,0xe9,0x10]
+; X64NOBW-NEXT: kmovw %ecx, %k2 # encoding: [0xc5,0xf8,0x92,0xd1]
+; X64NOBW-NEXT: kmovw %eax, %k3 # encoding: [0xc5,0xf8,0x92,0xd8]
+; X64NOBW-NEXT: kmovw %edi, %k4 # encoding: [0xc5,0xf8,0x92,0xe7]
+; X64NOBW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
+; X64NOBW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xe1,0x04]
+; X64NOBW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x05]
+; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
+; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X64NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01]
+; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} # encoding: [0x62,0xf3,0x55,0xc9,0x25,0xed,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X64NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} # encoding: [0x62,0xf3,0x4d,0xca,0x25,0xf6,0xff]
+; X64NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6]
+; X64NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01]
+; X64NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01]
+; X64NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc]
+; X64NOBW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd0,0xb8]
+; X64NOBW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X64NOBW-NEXT: retq # encoding: [0xc3]
 %1 = bitcast i64 %mask to <64 x i1>
 %2 = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3)
 %3 = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 4)
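
[Reviewer note, not part of the patch] For reference when reading the vgf2p8mulb tests below: each byte lane is a multiplication in GF(2^8) with the AES reduction polynomial x^8 + x^4 + x^3 + x + 1. A scalar model of one lane, as a sketch of the semantics (my code, not from the patch):

    /* Scalar model of one byte lane of vgf2p8mulb: carryless multiply,
       reduced modulo x^8 + x^4 + x^3 + x + 1 (0x11B). */
    static unsigned char gf2p8_mul_byte(unsigned char a, unsigned char b) {
      unsigned char p = 0;
      for (int i = 0; i < 8; i++) {
        if (b & 1)
          p ^= a;                          /* add (XOR) current multiple */
        unsigned char hi = a & 0x80;
        a = (unsigned char)(a << 1);
        if (hi)
          a ^= 0x1B;                       /* x^8 == x^4 + x^3 + x + 1 */
        b >>= 1;
      }
      return p;
    }
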
X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] +; X86NOBW-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x4c,0xc0,0x10] +; X86NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86NOBW-NEXT: retl # encoding: [0xc3] +; +; X64NOBW-LABEL: test_vgf2p8mulb_128_mask: +; X64NOBW: # %bb.0: +; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64NOBW-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc1] +; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] +; X64NOBW-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x4c,0xc0,0x10] +; X64NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i16 %mask to <16 x i1> %2 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2) %3 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> %passthru @@ -231,17 +489,37 @@ } define <16 x i8> @test_vgf2p8mulb_128_maskz(<16 x i8> %src1, <16 x i8> %src2, i16 %mask) { -; X86-LABEL: test_vgf2p8mulb_128_maskz: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_vgf2p8mulb_128_maskz: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xc1] -; X64-NEXT: retq # encoding: [0xc3] +; X86BW-LABEL: test_vgf2p8mulb_128_maskz: +; X86BW: # %bb.0: +; X86BW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86BW-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xc1] +; X86BW-NEXT: retl # encoding: [0xc3] +; +; X64BW-LABEL: test_vgf2p8mulb_128_maskz: +; X64BW: # %bb.0: +; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64BW-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xc1] +; X64BW-NEXT: retq # encoding: [0xc3] +; +; X86NOBW-LABEL: test_vgf2p8mulb_128_maskz: +; X86NOBW: # %bb.0: +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86NOBW-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc1] +; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] +; X86NOBW-NEXT: vpand %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xc0] +; X86NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86NOBW-NEXT: retl # encoding: [0xc3] +; +; X64NOBW-LABEL: test_vgf2p8mulb_128_maskz: +; X64NOBW: # %bb.0: +; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64NOBW-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc1] +; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] +; X64NOBW-NEXT: vpand %xmm0, %xmm1, 
%xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xc0] +; X64NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i16 %mask to <16 x i1> %2 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2) %3 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> zeroinitializer @@ -259,19 +537,46 @@ } define <32 x i8> @test_vgf2p8mulb_256_mask(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) { -; X86-LABEL: test_vgf2p8mulb_256_mask: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xcf,0xd1] -; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_vgf2p8mulb_256_mask: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xcf,0xd1] -; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] +; X86BW-LABEL: test_vgf2p8mulb_256_mask: +; X86BW: # %bb.0: +; X86BW-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86BW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xcf,0xd1] +; X86BW-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86BW-NEXT: retl # encoding: [0xc3] +; +; X64BW-LABEL: test_vgf2p8mulb_256_mask: +; X64BW: # %bb.0: +; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64BW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xcf,0xd1] +; X64BW-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64BW-NEXT: retq # encoding: [0xc3] +; +; X86NOBW-LABEL: test_vgf2p8mulb_256_mask: +; X86NOBW: # %bb.0: +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06] +; X86NOBW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc1] +; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] +; X86NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff] +; X86NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] +; X86NOBW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcb,0x01] +; X86NOBW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 # encoding: [0xc4,0xe3,0x6d,0x4c,0xc0,0x10] +; X86NOBW-NEXT: retl # encoding: [0xc3] +; +; X64NOBW-LABEL: test_vgf2p8mulb_256_mask: +; X64NOBW: # %bb.0: +; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64NOBW-NEXT: shrl $16, %edi # encoding: [0xc1,0xef,0x10] +; X64NOBW-NEXT: kmovw %edi, %k2 # encoding: [0xc5,0xf8,0x92,0xd7] +; X64NOBW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc1] +; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] +; X64NOBW-NEXT: vpternlogd $255, %zmm3, 
%zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff] +; X64NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] +; X64NOBW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcb,0x01] +; X64NOBW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 # encoding: [0xc4,0xe3,0x6d,0x4c,0xc0,0x10] +; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> %2 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2) %3 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> %passthru @@ -279,17 +584,44 @@ } define <32 x i8> @test_vgf2p8mulb_256_maskz(<32 x i8> %src1, <32 x i8> %src2, i32 %mask) { -; X86-LABEL: test_vgf2p8mulb_256_maskz: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_vgf2p8mulb_256_maskz: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xc1] -; X64-NEXT: retq # encoding: [0xc3] +; X86BW-LABEL: test_vgf2p8mulb_256_maskz: +; X86BW: # %bb.0: +; X86BW-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86BW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xc1] +; X86BW-NEXT: retl # encoding: [0xc3] +; +; X64BW-LABEL: test_vgf2p8mulb_256_maskz: +; X64BW: # %bb.0: +; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64BW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xc1] +; X64BW-NEXT: retq # encoding: [0xc3] +; +; X86NOBW-LABEL: test_vgf2p8mulb_256_maskz: +; X86NOBW: # %bb.0: +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06] +; X86NOBW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc1] +; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] +; X86NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} # encoding: [0x62,0xf3,0x6d,0xca,0x25,0xd2,0xff] +; X86NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] +; X86NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01] +; X86NOBW-NEXT: vpand %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xdb,0xc0] +; X86NOBW-NEXT: retl # encoding: [0xc3] +; +; X64NOBW-LABEL: test_vgf2p8mulb_256_maskz: +; X64NOBW: # %bb.0: +; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64NOBW-NEXT: shrl $16, %edi # encoding: [0xc1,0xef,0x10] +; X64NOBW-NEXT: kmovw %edi, %k2 # encoding: [0xc5,0xf8,0x92,0xd7] +; X64NOBW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc1] +; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] +; X64NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} # encoding: [0x62,0xf3,0x6d,0xca,0x25,0xd2,0xff] +; X64NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: 
[0x62,0xf2,0x7e,0x48,0x31,0xd2] +; X64NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01] +; X64NOBW-NEXT: vpand %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xdb,0xc0] +; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> %2 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2) %3 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> zeroinitializer @@ -307,19 +639,66 @@ } define <64 x i8> @test_vgf2p8mulb_512_mask(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) { -; X86-LABEL: test_vgf2p8mulb_512_mask: -; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0xcf,0xd1] -; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_vgf2p8mulb_512_mask: -; X64: # %bb.0: -; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0xcf,0xd1] -; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] +; X86BW-LABEL: test_vgf2p8mulb_512_mask: +; X86BW: # %bb.0: +; X86BW-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86BW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0xcf,0xd1] +; X86BW-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X86BW-NEXT: retl # encoding: [0xc3] +; +; X64BW-LABEL: test_vgf2p8mulb_512_mask: +; X64BW: # %bb.0: +; X64BW-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64BW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0xcf,0xd1] +; X64BW-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X64BW-NEXT: retq # encoding: [0xc3] +; +; X86NOBW-LABEL: test_vgf2p8mulb_512_mask: +; X86NOBW: # %bb.0: +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06] +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k3 # encoding: [0xc5,0xf8,0x90,0x5c,0x24,0x0a] +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08] +; X86NOBW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc9] +; X86NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} # encoding: [0x62,0xf3,0x7d,0xcc,0x25,0xc0,0xff] +; X86NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] +; X86NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} # encoding: [0x62,0xf3,0x65,0xcb,0x25,0xdb,0xff] +; X86NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] +; X86NOBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc3,0x01] +; X86NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x65,0xc9,0x25,0xdb,0xff] +; X86NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] +; X86NOBW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} # encoding: [0x62,0xf3,0x5d,0xca,0x25,0xe4,0xff] +; X86NOBW-NEXT: vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4] +; X86NOBW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: 
[0xc4,0xe3,0x65,0x38,0xdc,0x01] +; X86NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 # encoding: [0x62,0xf3,0xe5,0x48,0x3a,0xc0,0x01] +; X86NOBW-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf3,0xf5,0x48,0x25,0xc2,0xca] +; X86NOBW-NEXT: retl # encoding: [0xc3] +; +; X64NOBW-LABEL: test_vgf2p8mulb_512_mask: +; X64NOBW: # %bb.0: +; X64NOBW-NEXT: movq %rdi, %rax # encoding: [0x48,0x89,0xf8] +; X64NOBW-NEXT: movl %edi, %ecx # encoding: [0x89,0xf9] +; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64NOBW-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20] +; X64NOBW-NEXT: shrq $48, %rax # encoding: [0x48,0xc1,0xe8,0x30] +; X64NOBW-NEXT: shrl $16, %ecx # encoding: [0xc1,0xe9,0x10] +; X64NOBW-NEXT: kmovw %ecx, %k2 # encoding: [0xc5,0xf8,0x92,0xd1] +; X64NOBW-NEXT: kmovw %eax, %k3 # encoding: [0xc5,0xf8,0x92,0xd8] +; X64NOBW-NEXT: kmovw %edi, %k4 # encoding: [0xc5,0xf8,0x92,0xe7] +; X64NOBW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc9] +; X64NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} # encoding: [0x62,0xf3,0x7d,0xcc,0x25,0xc0,0xff] +; X64NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] +; X64NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} # encoding: [0x62,0xf3,0x65,0xcb,0x25,0xdb,0xff] +; X64NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] +; X64NOBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc3,0x01] +; X64NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x65,0xc9,0x25,0xdb,0xff] +; X64NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] +; X64NOBW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} # encoding: [0x62,0xf3,0x5d,0xca,0x25,0xe4,0xff] +; X64NOBW-NEXT: vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4] +; X64NOBW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x65,0x38,0xdc,0x01] +; X64NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 # encoding: [0x62,0xf3,0xe5,0x48,0x3a,0xc0,0x01] +; X64NOBW-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf3,0xf5,0x48,0x25,0xc2,0xca] +; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> %2 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2) %3 = select <64 x i1> %1, <64 x i8> %2, <64 x i8> %passthru @@ -327,17 +706,64 @@ } define <64 x i8> @test_vgf2p8mulb_512_maskz(<64 x i8> %src1, <64 x i8> %src2, i64 %mask) { -; X86-LABEL: test_vgf2p8mulb_512_maskz: -; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcf,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_vgf2p8mulb_512_maskz: -; X64: # %bb.0: -; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcf,0xc1] -; X64-NEXT: retq # encoding: [0xc3] +; X86BW-LABEL: test_vgf2p8mulb_512_maskz: +; X86BW: # %bb.0: +; X86BW-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86BW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcf,0xc1] +; X86BW-NEXT: retl # encoding: [0xc3] +; +; X64BW-LABEL: test_vgf2p8mulb_512_maskz: +; X64BW: # %bb.0: +; X64BW-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] 
+; X64BW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcf,0xc1] +; X64BW-NEXT: retq # encoding: [0xc3] +; +; X86NOBW-LABEL: test_vgf2p8mulb_512_maskz: +; X86NOBW: # %bb.0: +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06] +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k3 # encoding: [0xc5,0xf8,0x90,0x5c,0x24,0x0a] +; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08] +; X86NOBW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc1] +; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] +; X86NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z} # encoding: [0x62,0xf3,0x6d,0xcb,0x25,0xd2,0xff] +; X86NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] +; X86NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01] +; X86NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} # encoding: [0x62,0xf3,0x6d,0xc9,0x25,0xd2,0xff] +; X86NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] +; X86NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff] +; X86NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] +; X86NOBW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x6d,0x38,0xd3,0x01] +; X86NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 # encoding: [0x62,0xf3,0xed,0x48,0x3a,0xc9,0x01] +; X86NOBW-NEXT: vpandq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xdb,0xc0] +; X86NOBW-NEXT: retl # encoding: [0xc3] +; +; X64NOBW-LABEL: test_vgf2p8mulb_512_maskz: +; X64NOBW: # %bb.0: +; X64NOBW-NEXT: movq %rdi, %rax # encoding: [0x48,0x89,0xf8] +; X64NOBW-NEXT: movl %edi, %ecx # encoding: [0x89,0xf9] +; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64NOBW-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20] +; X64NOBW-NEXT: shrq $48, %rax # encoding: [0x48,0xc1,0xe8,0x30] +; X64NOBW-NEXT: shrl $16, %ecx # encoding: [0xc1,0xe9,0x10] +; X64NOBW-NEXT: kmovw %ecx, %k2 # encoding: [0xc5,0xf8,0x92,0xd1] +; X64NOBW-NEXT: kmovw %eax, %k3 # encoding: [0xc5,0xf8,0x92,0xd8] +; X64NOBW-NEXT: kmovw %edi, %k4 # encoding: [0xc5,0xf8,0x92,0xe7] +; X64NOBW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc1] +; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] +; X64NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z} # encoding: [0x62,0xf3,0x6d,0xcb,0x25,0xd2,0xff] +; X64NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] +; X64NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01] +; X64NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} # encoding: [0x62,0xf3,0x6d,0xc9,0x25,0xd2,0xff] +; X64NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] +; X64NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff] +; X64NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] +; 
X64NOBW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x6d,0x38,0xd3,0x01] +; X64NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 # encoding: [0x62,0xf3,0xed,0x48,0x3a,0xc9,0x01] +; X64NOBW-NEXT: vpandq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xdb,0xc0] +; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> %2 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2) %3 = select <64 x i1> %1, <64 x i8> %2, <64 x i8> zeroinitializer
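; NOTE (editorial sketch, not part of the generated checks): every masked test
; in this file uses the same IR idiom -- call the unmasked GFNI intrinsic, then
; select between its result and the passthru (or zero) vector on the bitcast
; mask. A minimal, self-contained form of that idiom (the function name
; sketch_gf2p8mulb_128_maskz is illustrative only) would be:
;
;   declare <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8>, <16 x i8>)
;
;   define <16 x i8> @sketch_gf2p8mulb_128_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
;     %m = bitcast i16 %mask to <16 x i1>          ; mask bits -> per-byte predicate
;     %r = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %a, <16 x i8> %b)
;     %z = select <16 x i1> %m, <16 x i8> %r, <16 x i8> zeroinitializer ; maskz: zero inactive lanes
;     ret <16 x i8> %z
;   }
;
; With +avx512bw the select folds into the instruction's {%k}/{%k}{z} form (the
; BW check prefixes above); without it the backend materializes a byte mask
; instead -- masked vpternlogd $255 writes all-ones dwords under each 16-bit
; kmovw chunk, vpmovdb narrows those dwords to bytes, vinserti128/vinserti64x4
; reassemble the chunks into the full vector, and vpand/vpblendvb/vpternlogq
; apply the select (the NOBW check prefixes above).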