diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -803,6 +803,7 @@ - ``-march=raptorlake`` and ``-march=meteorlake`` are now supported. - ``-march=sierraforest``, ``-march=graniterapids`` and ``-march=grandridge`` are now supported. - Lift _BitInt() supported max width from 128 to 8388608. +- Support ``*_reduce_*_ep[i|u]8/16`` series intrinsics. WebAssembly Support in Clang ---------------------------- diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -2803,6 +2803,368 @@ (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ (__v16hi)_mm256_setzero_si256())) +/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as + * outputs. This class of vector operation forms the basis of many scientific + * computations. In vector-reduction arithmetic, the evaluation order is + * independent of the order of the input elements of V. + + * Used bisection method. At each step, we partition the vector with previous + * step in half, and the operation is performed on its two halves. + * This takes log2(n) steps where n is the number of elements in the vector. 
+ */
+/* 128-bit vectors, 16-bit lanes: add/mul/and/or across all eight lanes. */
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_reduce_add_epi16(__m128i __W) {
+  return __builtin_reduce_add((__v8hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_reduce_mul_epi16(__m128i __W) {
+  return __builtin_reduce_mul((__v8hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_reduce_and_epi16(__m128i __W) {
+  return __builtin_reduce_and((__v8hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_reduce_or_epi16(__m128i __W) {
+  return __builtin_reduce_or((__v8hi)__W);
+}
+/* Masked add: inactive lanes are zeroed (0 is the additive identity). */
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_add_epi16( __mmask8 __M, __m128i __W) {
+  __W = _mm_maskz_mov_epi16(__M, __W);
+  return __builtin_reduce_add((__v8hi)__W);
+}
+/* Masked mul: inactive lanes become 1, the multiplicative identity. */
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_mul_epi16( __mmask8 __M, __m128i __W) {
+  __W = _mm_mask_mov_epi16(_mm_set1_epi16(1), __M, __W);
+  return __builtin_reduce_mul((__v8hi)__W);
+}
+/* Masked and: inactive lanes become all-ones (set1(-1)). */
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_and_epi16( __mmask8 __M, __m128i __W) {
+  __W = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __W);
+  return __builtin_reduce_and((__v8hi)__W);
+}
+/* Masked or: inactive lanes are zeroed. */
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_or_epi16(__mmask8 __M, __m128i __W) {
+  __W = _mm_maskz_mov_epi16(__M, __W);
+  return __builtin_reduce_or((__v8hi)__W);
+}
+/* Signed (epi16) and unsigned (epu16) max/min across the eight lanes. */
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_reduce_max_epi16(__m128i __V) {
+  return __builtin_reduce_max((__v8hi)__V);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS128
+_mm_reduce_max_epu16(__m128i __V) {
+  return __builtin_reduce_max((__v8hu)__V);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_reduce_min_epi16(__m128i __V) {
+  return __builtin_reduce_min((__v8hi)__V);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS128
+_mm_reduce_min_epu16(__m128i __V) {
+  return __builtin_reduce_min((__v8hu)__V);
+}
+/* Masked max/min substitute each op's identity (INT16_MIN for this one). */
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_max_epi16(__mmask16 __M, __m128i __V) {
+  __V = _mm_mask_mov_epi16(_mm_set1_epi16(-32767-1), __M, __V);
+  return __builtin_reduce_max((__v8hi)__V);
+}
+/* Identity: 0. NOTE(review): __mmask16 for 8 lanes; siblings use __mmask8. */
+static __inline__ unsigned short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_max_epu16(__mmask16 __M, __m128i __V) {
+  __V = _mm_maskz_mov_epi16(__M, __V);
+  return __builtin_reduce_max((__v8hu)__V);
+}
+/* Identity for signed min is 32767 (INT16_MAX). */
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_min_epi16(__mmask16 __M, __m128i __V) {
+  __V = _mm_mask_mov_epi16(_mm_set1_epi16(32767), __M, __V);
+  return __builtin_reduce_min((__v8hi)__V);
+}
+/* Identity for unsigned min is 0xFFFF, written as set1(-1). */
+static __inline__ unsigned short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_min_epu16(__mmask16 __M, __m128i __V) {
+  __V = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __V);
+  return __builtin_reduce_min((__v8hu)__V);
+}
+/* 256-bit vectors, 16-bit lanes: same families over sixteen lanes. */
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_reduce_add_epi16(__m256i __W) {
+  return __builtin_reduce_add((__v16hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_reduce_mul_epi16(__m256i __W) {
+  return __builtin_reduce_mul((__v16hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_reduce_and_epi16(__m256i __W) {
+  return __builtin_reduce_and((__v16hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_reduce_or_epi16(__m256i __W) {
+  return __builtin_reduce_or((__v16hi)__W);
+}
+/* Masked add: inactive lanes are zeroed (0 is the additive identity). */
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_add_epi16( __mmask16 __M, __m256i __W) {
+  __W = _mm256_maskz_mov_epi16(__M, __W);
+  return __builtin_reduce_add((__v16hi)__W);
+}
+/* Masked mul: inactive lanes become 1, the multiplicative identity. */
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_mul_epi16( __mmask16 __M, __m256i __W) {
+  __W = _mm256_mask_mov_epi16(_mm256_set1_epi16(1), __M, __W);
+  return __builtin_reduce_mul((__v16hi)__W);
+}
+/* Masked and: inactive lanes become all-ones (set1(-1)). */
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_and_epi16( __mmask16 __M, __m256i __W) {
+  __W = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __W);
+  return __builtin_reduce_and((__v16hi)__W);
+}
+/* Masked or: inactive lanes are zeroed. */
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_or_epi16(__mmask16 __M, __m256i __W) {
+  __W = _mm256_maskz_mov_epi16(__M, __W);
+  return __builtin_reduce_or((__v16hi)__W);
+}
+/* Signed (epi16) and unsigned (epu16) max/min across the sixteen lanes. */
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_reduce_max_epi16(__m256i __V) {
+  return __builtin_reduce_max((__v16hi)__V);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS256
+_mm256_reduce_max_epu16(__m256i __V) {
+  return __builtin_reduce_max((__v16hu)__V);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_reduce_min_epi16(__m256i __V) {
+  return __builtin_reduce_min((__v16hi)__V);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS256
+_mm256_reduce_min_epu16(__m256i __V) {
+  return __builtin_reduce_min((__v16hu)__V);
+}
+/* Masked max/min substitute each op's identity (INT16_MIN for this one). */
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_max_epi16(__mmask16 __M, __m256i __V) {
+  __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-32767-1), __M, __V);
+  return __builtin_reduce_max((__v16hi)__V);
+}
+/* Identity for unsigned max is 0 (zero-masking move). */
+static __inline__ unsigned short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_max_epu16(__mmask16 __M, __m256i __V) {
+  __V = _mm256_maskz_mov_epi16(__M, __V);
+  return __builtin_reduce_max((__v16hu)__V);
+}
+/* Identity for signed min is 32767 (INT16_MAX). */
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_min_epi16(__mmask16 __M, __m256i __V) {
+  __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(32767), __M, __V);
+  return __builtin_reduce_min((__v16hi)__V);
+}
+/* Identity for unsigned min is 0xFFFF, written as set1(-1). */
+static __inline__ unsigned short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_min_epu16(__mmask16 __M, __m256i __V) {
+  __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __V);
+  return __builtin_reduce_min((__v16hu)__V);
+}
+/* 128-bit vectors, 8-bit lanes: add/mul/and/or across all sixteen lanes. */
+static __inline__ char __DEFAULT_FN_ATTRS128
+_mm_reduce_add_epi8(__m128i __W) {
+  return __builtin_reduce_add((__v16qi)__W);
+}
+
+static __inline__ char __DEFAULT_FN_ATTRS128
+_mm_reduce_mul_epi8(__m128i __W) {
+  return __builtin_reduce_mul((__v16qi)__W);
+}
+
+static __inline__ char __DEFAULT_FN_ATTRS128
+_mm_reduce_and_epi8(__m128i __W) {
+  return __builtin_reduce_and((__v16qi)__W);
+}
+
+static __inline__ char __DEFAULT_FN_ATTRS128
+_mm_reduce_or_epi8(__m128i __W) {
+  return __builtin_reduce_or((__v16qi)__W);
+}
+/* Masked add: inactive lanes are zeroed (0 is the additive identity). */
+static __inline__ char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_add_epi8( __mmask16 __M, __m128i __W) {
+  __W = _mm_maskz_mov_epi8(__M, __W);
+  return __builtin_reduce_add((__v16qi)__W);
+}
+/* Masked mul: inactive lanes become 1, the multiplicative identity. */
+static __inline__ char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_mul_epi8( __mmask16 __M, __m128i __W) {
+  __W = _mm_mask_mov_epi8(_mm_set1_epi8(1), __M, __W);
+  return __builtin_reduce_mul((__v16qi)__W);
+}
+/* Masked and: inactive lanes become all-ones (set1(-1)). */
+static __inline__ char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_and_epi8( __mmask16 __M, __m128i __W) {
+  __W = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __W);
+  return __builtin_reduce_and((__v16qi)__W);
+}
+/* Masked or: inactive lanes are zeroed. */
+static __inline__ char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_or_epi8(__mmask16 __M, __m128i __W) {
+  __W = _mm_maskz_mov_epi8(__M, __W);
+  return __builtin_reduce_or((__v16qi)__W);
+}
+/* Signed max/min reduce over __v16qs so -funsigned-char cannot flip them. */
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_reduce_max_epi8(__m128i __V) {
+  return __builtin_reduce_max((__v16qs)__V);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS128
+_mm_reduce_max_epu8(__m128i __V) {
+  return __builtin_reduce_max((__v16qu)__V);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_reduce_min_epi8(__m128i __V) {
+  return __builtin_reduce_min((__v16qs)__V);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS128
+_mm_reduce_min_epu8(__m128i __V) {
+  return __builtin_reduce_min((__v16qu)__V);
+}
+/* Masked max/min: inactive lanes get the op's identity, here -128. */
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_max_epi8(__mmask16 __M, __m128i __V) {
+  __V = _mm_mask_mov_epi8(_mm_set1_epi8(-127-1), __M, __V);
+  return __builtin_reduce_max((__v16qs)__V);
+}
+/* Identity for unsigned max is 0 (zero-masking move). */
+static __inline__ unsigned char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_max_epu8(__mmask16 __M, __m128i __V) {
+  __V = _mm_maskz_mov_epi8(__M, __V);
+  return __builtin_reduce_max((__v16qu)__V);
+}
+/* Identity for signed min is 127. */
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_min_epi8(__mmask16 __M, __m128i __V) {
+  __V = _mm_mask_mov_epi8(_mm_set1_epi8(127), __M, __V);
+  return __builtin_reduce_min((__v16qs)__V);
+}
+/* Identity for unsigned min is 0xFF, written as set1(-1). */
+static __inline__ unsigned char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_min_epu8(__mmask16 __M, __m128i __V) {
+  __V = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __V);
+  return __builtin_reduce_min((__v16qu)__V);
+}
+/* 256-bit vectors, 8-bit lanes: same families over thirty-two lanes. */
+static __inline__ char __DEFAULT_FN_ATTRS256
+_mm256_reduce_add_epi8(__m256i __W) {
+  return __builtin_reduce_add((__v32qi)__W);
+}
+
+static __inline__ char __DEFAULT_FN_ATTRS256
+_mm256_reduce_mul_epi8(__m256i __W) {
+  return __builtin_reduce_mul((__v32qi)__W);
+}
+
+static __inline__ char __DEFAULT_FN_ATTRS256
+_mm256_reduce_and_epi8(__m256i __W) {
+  return __builtin_reduce_and((__v32qi)__W);
+}
+
+static __inline__ char __DEFAULT_FN_ATTRS256
+_mm256_reduce_or_epi8(__m256i __W) {
+  return __builtin_reduce_or((__v32qi)__W);
+}
+/* Masked add: inactive lanes are zeroed (0 is the additive identity). */
+static __inline__ char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_add_epi8( __mmask32 __M, __m256i __W) {
+  __W = _mm256_maskz_mov_epi8(__M, __W);
+  return __builtin_reduce_add((__v32qi)__W);
+}
+/* Masked mul: inactive lanes become 1, the multiplicative identity. */
+static __inline__ char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_mul_epi8( __mmask32 __M, __m256i __W) {
+  __W = _mm256_mask_mov_epi8(_mm256_set1_epi8(1), __M, __W);
+  return __builtin_reduce_mul((__v32qi)__W);
+}
+/* Masked and: inactive lanes become all-ones (set1(-1)). */
+static __inline__ char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_and_epi8( __mmask32 __M, __m256i __W) {
+  __W = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __W);
+  return __builtin_reduce_and((__v32qi)__W);
+}
+/* Masked or: inactive lanes are zeroed. */
+static __inline__ char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_or_epi8(__mmask32 __M, __m256i __W) {
+  __W = _mm256_maskz_mov_epi8(__M, __W);
+  return __builtin_reduce_or((__v32qi)__W);
+}
+/* Signed max/min reduce over __v32qs so -funsigned-char cannot flip them. */
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_reduce_max_epi8(__m256i __V) {
+  return __builtin_reduce_max((__v32qs)__V);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS256
+_mm256_reduce_max_epu8(__m256i __V) {
+  return __builtin_reduce_max((__v32qu)__V);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_reduce_min_epi8(__m256i __V) {
+  return __builtin_reduce_min((__v32qs)__V);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS256
+_mm256_reduce_min_epu8(__m256i __V) {
+  return __builtin_reduce_min((__v32qu)__V);
+}
+/* Masked max/min: inactive lanes get the op's identity, here -128. */
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_max_epi8(__mmask32 __M, __m256i __V) {
+  __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-127-1), __M, __V);
+  return __builtin_reduce_max((__v32qs)__V);
+}
+/* Identity for unsigned max is 0 (zero-masking move). */
+static __inline__ unsigned char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_max_epu8(__mmask32 __M, __m256i __V) {
+  __V = _mm256_maskz_mov_epi8(__M, __V);
+  return __builtin_reduce_max((__v32qu)__V);
+}
+/* Identity for signed min is 127. */
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_min_epi8(__mmask32 __M, __m256i __V) {
+  __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(127), __M, __V);
+  return __builtin_reduce_min((__v32qs)__V);
+}
+/* Identity for unsigned min is 0xFF, written as set1(-1). */
+static __inline__ unsigned char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_min_epu8(__mmask32 __M, __m256i __V) {
+  __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __V);
+  return __builtin_reduce_min((__v32qu)__V);
+}
+
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
 
diff --git a/clang/test/CodeGen/X86/avx512vlbw-reduceIntrin.c b/clang/test/CodeGen/X86/avx512vlbw-reduceIntrin.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx512vlbw-reduceIntrin.c
@@ -0,0 +1,211 @@
+// RUN: %clang_cc1 -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
+// Expect one llvm.vector.reduce.* call per intrinsic, preceded by a select when masked.
+#include <immintrin.h>
+
+short test_mm_reduce_add_epi16(__m128i __W){
+// CHECK-LABEL: @test_mm_reduce_add_epi16(
+// CHECK: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %{{.*}})
+  return _mm_reduce_add_epi16(__W);
+}
+
+short test_mm_reduce_mul_epi16(__m128i __W){
+// CHECK-LABEL: @test_mm_reduce_mul_epi16(
+// CHECK: call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %{{.*}})
+  return _mm_reduce_mul_epi16(__W);
+}
+
+short test_mm_reduce_or_epi16(__m128i __W){
+// CHECK-LABEL: @test_mm_reduce_or_epi16(
+// CHECK: call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %{{.*}})
+  return _mm_reduce_or_epi16(__W);
+}
+
+short test_mm_reduce_and_epi16(__m128i __W){
+// CHECK-LABEL: @test_mm_reduce_and_epi16(
+// CHECK: call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %{{.*}})
+  return _mm_reduce_and_epi16(__W);
+}
+
+short test_mm_mask_reduce_add_epi16(__mmask8 __M, __m128i __W){
+// CHECK-LABEL: @test_mm_mask_reduce_add_epi16(
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %{{.*}})
+  return _mm_mask_reduce_add_epi16(__M, __W);
+}
+
+short test_mm_mask_reduce_mul_epi16(__mmask8 __M, __m128i __W){
+// CHECK-LABEL: @test_mm_mask_reduce_mul_epi16(
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %{{.*}})
+  return _mm_mask_reduce_mul_epi16(__M, __W);
+}
+
+short test_mm_mask_reduce_and_epi16(__mmask8 __M, __m128i __W){
+// CHECK-LABEL: @test_mm_mask_reduce_and_epi16(
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %{{.*}})
+  return _mm_mask_reduce_and_epi16(__M, __W);
+}
+
+short test_mm_mask_reduce_or_epi16(__mmask8 __M, __m128i __W){
+// CHECK-LABEL: @test_mm_mask_reduce_or_epi16(
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %{{.*}})
+  return _mm_mask_reduce_or_epi16(__M, __W);
+}
+
+short test_mm256_reduce_add_epi16(__m256i __W){
+// CHECK-LABEL: @test_mm256_reduce_add_epi16(
+// CHECK: call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %{{.*}})
+  return _mm256_reduce_add_epi16(__W);
+}
+
+short test_mm256_reduce_mul_epi16(__m256i __W){
+// CHECK-LABEL: @test_mm256_reduce_mul_epi16(
+// CHECK: call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %{{.*}})
+  return _mm256_reduce_mul_epi16(__W);
+}
+
+short test_mm256_reduce_or_epi16(__m256i __W){
+// CHECK-LABEL: @test_mm256_reduce_or_epi16(
+// CHECK: call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %{{.*}})
+  return _mm256_reduce_or_epi16(__W);
+}
+
+short test_mm256_reduce_and_epi16(__m256i __W){
+// CHECK-LABEL: @test_mm256_reduce_and_epi16(
+// CHECK: call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %{{.*}})
+  return _mm256_reduce_and_epi16(__W);
+}
+
+short test_mm256_mask_reduce_add_epi16(__mmask16 __M, __m256i __W){
+// CHECK-LABEL: @test_mm256_mask_reduce_add_epi16(
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %{{.*}})
+  return _mm256_mask_reduce_add_epi16(__M, __W);
+}
+
+short test_mm256_mask_reduce_mul_epi16(__mmask16 __M, __m256i __W){
+// CHECK-LABEL: @test_mm256_mask_reduce_mul_epi16(
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %{{.*}})
+  return _mm256_mask_reduce_mul_epi16(__M, __W);
+}
+
+short test_mm256_mask_reduce_and_epi16(__mmask16 __M, __m256i __W){
+// CHECK-LABEL: @test_mm256_mask_reduce_and_epi16(
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %{{.*}})
+  return _mm256_mask_reduce_and_epi16(__M, __W);
+}
+
+short test_mm256_mask_reduce_or_epi16(__mmask16 __M, __m256i __W){
+// CHECK-LABEL: @test_mm256_mask_reduce_or_epi16(
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %{{.*}})
+  return _mm256_mask_reduce_or_epi16(__M, __W);
+}
+
+char test_mm_reduce_add_epi8(__m128i __W){
+// CHECK-LABEL: @test_mm_reduce_add_epi8(
+// CHECK: call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %{{.*}})
+  return _mm_reduce_add_epi8(__W);
+}
+
+char test_mm_reduce_mul_epi8(__m128i __W){
+// CHECK-LABEL: @test_mm_reduce_mul_epi8(
+// CHECK: call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %{{.*}})
+  return _mm_reduce_mul_epi8(__W);
+}
+
+char test_mm_reduce_and_epi8(__m128i __W){
+// CHECK-LABEL: @test_mm_reduce_and_epi8(
+// CHECK: call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %{{.*}})
+  return _mm_reduce_and_epi8(__W);
+}
+
+char test_mm_reduce_or_epi8(__m128i __W){
+// CHECK-LABEL: @test_mm_reduce_or_epi8(
+// CHECK: call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %{{.*}})
+  return _mm_reduce_or_epi8(__W);
+}
+
+char test_mm_mask_reduce_add_epi8(__mmask16 __M, __m128i __W){
+// CHECK-LABEL: @test_mm_mask_reduce_add_epi8(
+// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %{{.*}})
+  return _mm_mask_reduce_add_epi8(__M, __W);
+}
+
+char test_mm_mask_reduce_mul_epi8(__mmask16 __M, __m128i __W){
+// CHECK-LABEL: @test_mm_mask_reduce_mul_epi8(
+// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %{{.*}})
+  return _mm_mask_reduce_mul_epi8(__M, __W);
+}
+
+char test_mm_mask_reduce_and_epi8(__mmask16 __M, __m128i __W){
+// CHECK-LABEL: @test_mm_mask_reduce_and_epi8(
+// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %{{.*}})
+  return _mm_mask_reduce_and_epi8(__M, __W);
+}
+
+char test_mm_mask_reduce_or_epi8(__mmask16 __M, __m128i __W){
+// CHECK-LABEL: @test_mm_mask_reduce_or_epi8(
+// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %{{.*}})
+  return _mm_mask_reduce_or_epi8(__M, __W);
+}
+
+char test_mm256_reduce_add_epi8(__m256i __W){
+// CHECK-LABEL: @test_mm256_reduce_add_epi8(
+// CHECK: call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %{{.*}})
+  return _mm256_reduce_add_epi8(__W);
+}
+
+char test_mm256_reduce_mul_epi8(__m256i __W){
+// CHECK-LABEL: @test_mm256_reduce_mul_epi8(
+// CHECK: call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %{{.*}})
+  return _mm256_reduce_mul_epi8(__W);
+}
+
+char test_mm256_reduce_and_epi8(__m256i __W){
+// CHECK-LABEL: @test_mm256_reduce_and_epi8(
+// CHECK: call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %{{.*}})
+  return _mm256_reduce_and_epi8(__W);
+}
+
+char test_mm256_reduce_or_epi8(__m256i __W){
+// CHECK-LABEL: @test_mm256_reduce_or_epi8(
+// CHECK: call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %{{.*}})
+  return _mm256_reduce_or_epi8(__W);
+}
+
+char test_mm256_mask_reduce_add_epi8(__mmask32 __M, __m256i __W){
+// CHECK-LABEL: @test_mm256_mask_reduce_add_epi8(
+// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %{{.*}})
+  return _mm256_mask_reduce_add_epi8(__M, __W);
+}
+
+char test_mm256_mask_reduce_mul_epi8(__mmask32 __M, __m256i __W){
+// CHECK-LABEL: @test_mm256_mask_reduce_mul_epi8(
+// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %{{.*}})
+  return _mm256_mask_reduce_mul_epi8(__M, __W);
+}
+
+char test_mm256_mask_reduce_and_epi8(__mmask32 __M, __m256i __W){
+// CHECK-LABEL: @test_mm256_mask_reduce_and_epi8(
+// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %{{.*}})
+  return _mm256_mask_reduce_and_epi8(__M, __W);
+}
+
+char test_mm256_mask_reduce_or_epi8(__mmask32 __M, __m256i __W){
+// CHECK-LABEL: @test_mm256_mask_reduce_or_epi8(
+// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %{{.*}})
+  return _mm256_mask_reduce_or_epi8(__M, __W);
+}
diff --git a/clang/test/CodeGen/X86/avx512vlbw-reduceMinMaxIntrin.c 
b/clang/test/CodeGen/X86/avx512vlbw-reduceMinMaxIntrin.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx512vlbw-reduceMinMaxIntrin.c
@@ -0,0 +1,211 @@
+// RUN: %clang_cc1 -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
+// Expect smax/smin for epi* and umax/umin for epu*, preceded by a select when masked.
+#include <immintrin.h>
+
+short test_mm_reduce_max_epi16(__m128i __W){
+// CHECK-LABEL: test_mm_reduce_max_epi16
+// CHECK: call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %{{.*}})
+  return _mm_reduce_max_epi16(__W);
+}
+
+short test_mm_reduce_min_epi16(__m128i __W){
+// CHECK-LABEL: test_mm_reduce_min_epi16
+// CHECK: call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %{{.*}})
+  return _mm_reduce_min_epi16(__W);
+}
+
+unsigned short test_mm_reduce_max_epu16(__m128i __W){
+// CHECK-LABEL: test_mm_reduce_max_epu16
+// CHECK: call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %{{.*}})
+  return _mm_reduce_max_epu16(__W);
+}
+
+unsigned short test_mm_reduce_min_epu16(__m128i __W){
+// CHECK-LABEL: test_mm_reduce_min_epu16
+// CHECK: call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %{{.*}})
+  return _mm_reduce_min_epu16(__W);
+}
+
+short test_mm_mask_reduce_max_epi16(__mmask8 __M, __m128i __W){
+// CHECK-LABEL: test_mm_mask_reduce_max_epi16
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %{{.*}})
+  return _mm_mask_reduce_max_epi16(__M, __W);
+}
+
+short test_mm_mask_reduce_min_epi16(__mmask8 __M, __m128i __W){
+// CHECK-LABEL: test_mm_mask_reduce_min_epi16
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %{{.*}})
+  return _mm_mask_reduce_min_epi16(__M, __W);
+}
+
+unsigned short test_mm_mask_reduce_max_epu16(__mmask8 __M, __m128i __W){
+// CHECK-LABEL: test_mm_mask_reduce_max_epu16
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %{{.*}})
+  return _mm_mask_reduce_max_epu16(__M, __W);
+}
+
+unsigned short test_mm_mask_reduce_min_epu16(__mmask8 __M, __m128i __W){
+// CHECK-LABEL: test_mm_mask_reduce_min_epu16
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %{{.*}})
+  return _mm_mask_reduce_min_epu16(__M, __W);
+}
+
+short test_mm256_reduce_max_epi16(__m256i __W){
+// CHECK-LABEL: test_mm256_reduce_max_epi16
+// CHECK: call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %{{.*}})
+  return _mm256_reduce_max_epi16(__W);
+}
+
+short test_mm256_reduce_min_epi16(__m256i __W){
+// CHECK-LABEL: test_mm256_reduce_min_epi16
+// CHECK: call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %{{.*}})
+  return _mm256_reduce_min_epi16(__W);
+}
+
+unsigned short test_mm256_reduce_max_epu16(__m256i __W){
+// CHECK-LABEL: test_mm256_reduce_max_epu16
+// CHECK: call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %{{.*}})
+  return _mm256_reduce_max_epu16(__W);
+}
+
+unsigned short test_mm256_reduce_min_epu16(__m256i __W){
+// CHECK-LABEL: test_mm256_reduce_min_epu16
+// CHECK: call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %{{.*}})
+  return _mm256_reduce_min_epu16(__W);
+}
+
+short test_mm256_mask_reduce_max_epi16(__mmask16 __M, __m256i __W){
+// CHECK-LABEL: test_mm256_mask_reduce_max_epi16
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %{{.*}})
+  return _mm256_mask_reduce_max_epi16(__M, __W);
+}
+
+short test_mm256_mask_reduce_min_epi16(__mmask16 __M, __m256i __W){
+// CHECK-LABEL: test_mm256_mask_reduce_min_epi16
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %{{.*}})
+  return _mm256_mask_reduce_min_epi16(__M, __W);
+}
+
+unsigned short test_mm256_mask_reduce_max_epu16(__mmask16 __M, __m256i __W){
+// CHECK-LABEL: test_mm256_mask_reduce_max_epu16
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %{{.*}})
+  return _mm256_mask_reduce_max_epu16(__M, __W);
+}
+
+unsigned short test_mm256_mask_reduce_min_epu16(__mmask16 __M, __m256i __W){
+// CHECK-LABEL: test_mm256_mask_reduce_min_epu16
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+// CHECK: call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %{{.*}})
+  return _mm256_mask_reduce_min_epu16(__M, __W);
+}
+
+signed char test_mm_reduce_max_epi8(__m128i __W){
+// CHECK-LABEL: test_mm_reduce_max_epi8
+// CHECK: call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %{{.*}})
+  return _mm_reduce_max_epi8(__W);
+}
+
+signed char test_mm_reduce_min_epi8(__m128i __W){
+// CHECK-LABEL: test_mm_reduce_min_epi8
+// CHECK: call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %{{.*}})
+  return _mm_reduce_min_epi8(__W);
+}
+
+unsigned char test_mm_reduce_max_epu8(__m128i __W){
+// CHECK-LABEL: test_mm_reduce_max_epu8
+// CHECK: call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %{{.*}})
+  return _mm_reduce_max_epu8(__W);
+}
+
+unsigned char test_mm_reduce_min_epu8(__m128i __W){
+// CHECK-LABEL: test_mm_reduce_min_epu8
+// CHECK: call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %{{.*}})
+  return _mm_reduce_min_epu8(__W);
+}
+
+signed char test_mm_mask_reduce_max_epi8(__mmask16 __M, __m128i __W){
+// CHECK-LABEL: test_mm_mask_reduce_max_epi8
+// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %{{.*}})
+  return _mm_mask_reduce_max_epi8(__M, __W);
+}
+
+signed char test_mm_mask_reduce_min_epi8(__mmask16 __M, __m128i __W){
+// CHECK-LABEL: test_mm_mask_reduce_min_epi8
+// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %{{.*}})
+  return _mm_mask_reduce_min_epi8(__M, __W);
+}
+
+unsigned char test_mm_mask_reduce_max_epu8(__mmask16 __M, __m128i __W){
+// CHECK-LABEL: test_mm_mask_reduce_max_epu8
+// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %{{.*}})
+  return _mm_mask_reduce_max_epu8(__M, __W);
+}
+
+unsigned char test_mm_mask_reduce_min_epu8(__mmask16 __M, __m128i __W){
+// CHECK-LABEL: test_mm_mask_reduce_min_epu8
+// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %{{.*}})
+  return _mm_mask_reduce_min_epu8(__M, __W);
+}
+
+signed char test_mm256_reduce_max_epi8(__m256i __W){
+// CHECK-LABEL: test_mm256_reduce_max_epi8
+// CHECK: call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %{{.*}})
+  return _mm256_reduce_max_epi8(__W);
+}
+
+signed char test_mm256_reduce_min_epi8(__m256i __W){
+// CHECK-LABEL: test_mm256_reduce_min_epi8
+// CHECK: call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %{{.*}})
+  return _mm256_reduce_min_epi8(__W);
+}
+
+unsigned char test_mm256_reduce_max_epu8(__m256i __W){
+// CHECK-LABEL: test_mm256_reduce_max_epu8
+// CHECK: call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %{{.*}})
+  return _mm256_reduce_max_epu8(__W);
+}
+
+unsigned char test_mm256_reduce_min_epu8(__m256i __W){
+// CHECK-LABEL: test_mm256_reduce_min_epu8
+// CHECK: call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %{{.*}})
+  return _mm256_reduce_min_epu8(__W);
+}
+
+signed char test_mm256_mask_reduce_max_epi8(__mmask32 __M, __m256i __W){
+// CHECK-LABEL: test_mm256_mask_reduce_max_epi8
+// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %{{.*}})
+  return _mm256_mask_reduce_max_epi8(__M, __W);
+}
+
+signed char test_mm256_mask_reduce_min_epi8(__mmask32 __M, __m256i __W){
+// CHECK-LABEL: test_mm256_mask_reduce_min_epi8
+// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %{{.*}})
+  return _mm256_mask_reduce_min_epi8(__M, __W);
+}
+
+unsigned char test_mm256_mask_reduce_max_epu8(__mmask32 __M, __m256i __W){
+// CHECK-LABEL: test_mm256_mask_reduce_max_epu8
+// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %{{.*}})
+  return _mm256_mask_reduce_max_epu8(__M, __W);
+}
+
+unsigned char test_mm256_mask_reduce_min_epu8(__mmask32 __M, __m256i __W){
+// CHECK-LABEL: test_mm256_mask_reduce_min_epu8
+// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+// CHECK: call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %{{.*}})
+  return _mm256_mask_reduce_min_epu8(__M, __W);
+}