Index: lib/Headers/avx512fintrin.h =================================================================== --- lib/Headers/avx512fintrin.h +++ lib/Headers/avx512fintrin.h @@ -9660,6 +9660,245 @@ return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A); } +// Vector-reduction arithmetic accepts vectors as inputs and produces scalars as +// outputs. This class of vector operation forms the basis of many scientific +// computations. In vector-reduction arithmetic, the evaluation of the operator +// is independent of the order of the input elements of the vector. + +// The bisection method is used: at each step the vector produced by the +// previous step is split in half and the operation is applied to its two +// halves. This takes log2(n) steps, where n is the number of elements in the +// vector. + +// Vec512 - Vector with size 512. +// Operator - Can be one of the following: +, *, &, | +// FloatOrInt - 'i' for int or 'f' for float. +// VectorType - 'i' for int or 'd' for double. 
+ +#define _mm512_reduce_operator_double(Vec512, Operator, FloatOrInt, \ + VectorType) \ + __extension__({ \ + __m256##VectorType Vec256 = __builtin_shufflevector( \ + (__v8d##FloatOrInt)Vec512, (__v8d##FloatOrInt)Vec512, 0, 1, 2, 3) \ + Operator __builtin_shufflevector( \ + (__v8d##FloatOrInt)Vec512, (__v8d##FloatOrInt)Vec512, 4, 5, 6, 7); \ + __m128##VectorType Vec128 = __builtin_shufflevector( \ + (__v4d##FloatOrInt)Vec256, (__v4d##FloatOrInt)Vec256, 0, 1) \ + Operator __builtin_shufflevector((__v4d##FloatOrInt)Vec256, \ + (__v4d##FloatOrInt)Vec256, 2, 3); \ + Vec128 = __builtin_shufflevector((__v2d##FloatOrInt)Vec128, \ + (__v2d##FloatOrInt)Vec128, 0, 1) \ + Operator __builtin_shufflevector((__v2d##FloatOrInt)Vec128, \ + (__v2d##FloatOrInt)Vec128, 1, 0); \ + return Vec128[0]; \ + }) + +static __inline__ long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) { + _mm512_reduce_operator_double(__W, +, i, i); +} + +static __inline__ long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) { + _mm512_reduce_operator_double(__W, *, i, i); +} + +static __inline__ long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) { + _mm512_reduce_operator_double(__W, &, i, i); +} + +static __inline__ long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) { + _mm512_reduce_operator_double(__W, |, i, i); +} + +static __inline__ long __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) { + _mm512_reduce_operator_double(__W, +, f, d); +} + +static __inline__ long __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) { + _mm512_reduce_operator_double(__W, *, f, d); +} + +// Vec512 - Vector with size 512. +// Operator - Can be one of following: +,*,&&,|| +// Mask - Intrinsic Mask +// Neutral - Identity element: {+,0},{*,1},{&&,0xFFFFFFFF},{||,0} +// FloatOrInt - Can get 'i' for int and 'f' for float. +// VectorType - Can get 'i' for int and 'd' for packed double-precision. +// PdOrq - Can be Pd for packed double or q for q-word. 
+ +#define _mm512_mask_reduce_operator_double(Vec512, Operator, Mask, Neutral, \ + FloatOrInt, VectorType, PdOrq) \ + __extension__({ \ + Vec512 = __builtin_ia32_select##PdOrq##_512( \ + (__mmask8)Mask, (__v8d##FloatOrInt)Vec512, \ + (__v8d##FloatOrInt)_mm512_set1_epi64(Neutral)); \ + __m256##VectorType Vec256 = __builtin_shufflevector( \ + (__v8d##FloatOrInt)Vec512, (__v8d##FloatOrInt)Vec512, 0, 1, 2, 3) \ + Operator __builtin_shufflevector( \ + (__v8d##FloatOrInt)Vec512, (__v8d##FloatOrInt)Vec512, 4, 5, 6, 7); \ + __m128##VectorType Vec128 = __builtin_shufflevector( \ + (__v4d##FloatOrInt)Vec256, (__v4d##FloatOrInt)Vec256, 0, 1) \ + Operator __builtin_shufflevector((__v4d##FloatOrInt)Vec256, \ + (__v4d##FloatOrInt)Vec256, 2, 3); \ + Vec128 = __builtin_shufflevector((__v2d##FloatOrInt)Vec128, \ + (__v2d##FloatOrInt)Vec128, 0, 1) \ + Operator __builtin_shufflevector((__v2d##FloatOrInt)Vec128, \ + (__v2d##FloatOrInt)Vec128, 1, 0); \ + return Vec128[0]; \ + }) + +static __inline__ long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_add_epi64(__m512i __W, __mmask8 __M) { + _mm512_mask_reduce_operator_double(__W, +, __M, 0, i, i, q); +} + +static __inline__ long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_mul_epi64(__m512i __W, __mmask8 __M) { + _mm512_mask_reduce_operator_double(__W, *, __M, 1, i, i, q); +} + +static __inline__ long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_and_epi64(__m512i __W, __mmask8 __M) { + _mm512_mask_reduce_operator_double(__W, &, __M, 0xFFFFFFFF, i, i, q); +} + +static __inline__ long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_or_epi64(__m512i __W, __mmask8 __M) { + _mm512_mask_reduce_operator_double(__W, |, __M, 0, i, i, q); +} + +static __inline__ long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_add_pd(__m512d __W, __mmask8 __M) { + _mm512_mask_reduce_operator_double(__W, +, __M, 0, f, d, pd); +} + +static __inline__ long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_mul_pd(__m512d __W, __mmask8 __M) { + _mm512_mask_reduce_operator_double(__W, *, __M, 1, f, d, pd); +} + 
+// Vec512 - Vector with size 512. +// Operator - Can be one of following: +,*,&&,|| +// FloatOrInt - Can get 'i' for int and ' ' for packed signal. +// VectorType - Can get 'i' for int and 'f' for float. + +#define _mm512_reduce_operator_singal(Vec512, Operator, FloatOrInt, \ + VectorType) \ + __extension__({ \ + __m256##VectorType Vec256 = (__m256##VectorType)__builtin_shufflevector( \ + (__v16s##FloatOrInt)Vec512, (__v16s##FloatOrInt)Vec512, 0, 1, 2, 3, 4, \ + 5, 6, 7) Operator(__m256##VectorType) \ + __builtin_shufflevector((__v16s##FloatOrInt)Vec512, \ + (__v16s##FloatOrInt)Vec512, 8, 9, 10, 11, 12, \ + 13, 14, 15); \ + __m128##VectorType Vec128 = (__m128##VectorType)__builtin_shufflevector( \ + (__v8s##FloatOrInt)Vec256, (__v8s##FloatOrInt)Vec256, 0, 1, 2, 3) \ + Operator(__m128##VectorType) __builtin_shufflevector( \ + (__v8s##FloatOrInt)Vec256, (__v8s##FloatOrInt)Vec256, 4, 5, 6, 7); \ + Vec128 = (__m128##VectorType)__builtin_shufflevector( \ + (__v4s##FloatOrInt)Vec128, (__v4s##FloatOrInt)Vec128, 0, 1, 2, 3) \ + Operator(__m128##VectorType) __builtin_shufflevector( \ + (__v4s##FloatOrInt)Vec128, (__v4s##FloatOrInt)Vec128, 2, 3, 0, 1); \ + Vec128 = (__m128##VectorType)__builtin_shufflevector( \ + (__v4s##FloatOrInt)Vec128, (__v4s##FloatOrInt)Vec128, 0, 1, 2, 3) \ + Operator(__m128##VectorType) __builtin_shufflevector( \ + (__v4s##FloatOrInt)Vec128, (__v4s##FloatOrInt)Vec128, 1, 0, 2, 3); \ + return Vec128[0]; \ + }) + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_add_epi32(__m512i __W) { + _mm512_reduce_operator_singal(__W, +, i, i); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_mul_epi32(__m512i __W) { + _mm512_reduce_operator_singal(__W, *, i, i); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_and_epi32(__m512i __W) { + _mm512_reduce_operator_singal(__W, &, i, i); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_or_epi32(__m512i __W) { + _mm512_reduce_operator_singal(__W, |, i, i); +} + +static 
__inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_add_ps(__m512 __W) { + _mm512_reduce_operator_singal(__W, +, f, ); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_mul_ps(__m512 __W) { + _mm512_reduce_operator_singal(__W, *, f, ); +} + +// Vec512 - Vector with size 512. +// Operator - Can be one of following: +,*,&&,|| +// Mask - Intrinsic Mask +// Neutral - Identity element: {+,0},{*,1},{&&,0xFFFFFFFF},{||,0} +// FloatOrInt - Can get 'i' for int and 'f' for float. +// VectorType - Can get 'i' for int and 'd' for double. +// PsOrD - Can be Ps for packed singal or d for d-word. + +#define _mm512_mask_reduce_operator_singal(Vec512, Operator, Mask, Neutral, \ + FloatOrInt, VectorType, psOrD) \ + __extension__({ \ + Vec512 = (__m512##VectorType)__builtin_ia32_select##psOrD##_512( \ + (__mmask16)Mask, (__v16s##FloatOrInt)Vec512, \ + (__v16s##FloatOrInt)_mm512_set1_epi64(Neutral)); \ + __m256##VectorType Vec256 = (__m256##VectorType)__builtin_shufflevector( \ + (__v16s##FloatOrInt)Vec512, (__v16s##FloatOrInt)Vec512, 0, 1, 2, 3, 4, \ + 5, 6, 7) Operator(__m256##VectorType) \ + __builtin_shufflevector((__v16s##FloatOrInt)Vec512, \ + (__v16s##FloatOrInt)Vec512, 8, 9, 10, 11, 12, \ + 13, 14, 15); \ + __m128##VectorType Vec128 = (__m128##VectorType)__builtin_shufflevector( \ + (__v8s##FloatOrInt)Vec256, (__v8s##FloatOrInt)Vec256, 0, 1, 2, 3) \ + Operator(__m128##VectorType) __builtin_shufflevector( \ + (__v8s##FloatOrInt)Vec256, (__v8s##FloatOrInt)Vec256, 4, 5, 6, 7); \ + Vec128 = (__m128##VectorType)__builtin_shufflevector( \ + (__v4s##FloatOrInt)Vec128, (__v4s##FloatOrInt)Vec128, 0, 1, 2, 3) \ + Operator(__m128##VectorType) __builtin_shufflevector( \ + (__v4s##FloatOrInt)Vec128, (__v4s##FloatOrInt)Vec128, 2, 3, 0, 1); \ + Vec128 = (__m128##VectorType)__builtin_shufflevector( \ + (__v4s##FloatOrInt)Vec128, (__v4s##FloatOrInt)Vec128, 0, 1, 2, 3) \ + Operator(__m128##VectorType) __builtin_shufflevector( \ + (__v4s##FloatOrInt)Vec128, (__v4s##FloatOrInt)Vec128, 1, 
0, 2, 3); \ + return Vec128[0]; \ + }) + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_add_epi32(__m512i __W, __mmask16 __M) { + _mm512_mask_reduce_operator_singal(__W, +, __M, 0, i, i, d); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_mul_epi32(__m512i __W, __mmask16 __M) { + _mm512_mask_reduce_operator_singal(__W, *, __M, 1, i, i, d); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_and_epi32(__m512i __W, __mmask16 __M) { + _mm512_mask_reduce_operator_singal(__W, &, __M, 0xFFFFFFFFFFFFFFFF, i, i, d); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_or_epi32(__m512i __W, __mmask16 __M) { + _mm512_mask_reduce_operator_singal(__W, |, __M, 0, i, i, d); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_add_ps(__m512 __W, __mmask16 __M) { + _mm512_mask_reduce_operator_singal(__W, +, __M, 0, f, , ps); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_mul_ps(__m512 __W, __mmask16 __M) { + _mm512_mask_reduce_operator_singal(__W, *, __M, 1, f, , ps); +} + #undef __DEFAULT_FN_ATTRS #endif // __AVX512FINTRIN_H Index: test/CodeGen/avx512-reduceIntrin.c =================================================================== --- test/CodeGen/avx512-reduceIntrin.c +++ test/CodeGen/avx512-reduceIntrin.c @@ -0,0 +1,469 @@ +// RUN: %clang_cc1 -ffreestanding %s -O2 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s + +#include + +int test_mm512_reduce_add_epi64(__m512i __W){ + // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> + // CHECK: %add.i = add <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> + // CHECK: %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: 
%shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> + // CHECK: %add7.i = add <2 x i64> %shuffle6.i, %add4.i + // CHECK: %vecext.i = extractelement <2 x i64> %add7.i, i32 0 + // CHECK: %conv = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv + return _mm512_reduce_add_epi64(__W); +} + +int test_mm512_reduce_mul_epi64(__m512i __W){ + // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> + // CHECK: %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> + // CHECK: %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> + // CHECK: %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i + // CHECK: %vecext.i = extractelement <2 x i64> %mul7.i, i32 0 + // CHECK: %conv = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv + return _mm512_reduce_mul_epi64(__W); +} + +int test_mm512_reduce_or_epi64(__m512i __W){ + // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> + // CHECK: %or.i = or <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> + // CHECK: %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> + // CHECK: %or7.i = or <2 x i64> %shuffle6.i, %or4.i + // CHECK: %vecext.i = extractelement <2 x i64> %or7.i, i32 0 + // CHECK: %conv = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv + return _mm512_reduce_or_epi64(__W); +} + +int test_mm512_reduce_and_epi64(__m512i __W){ + 
// CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> + // CHECK: %and.i = and <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> + // CHECK: %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> + // CHECK: %and7.i = and <2 x i64> %shuffle6.i, %and4.i + // CHECK: %vecext.i = extractelement <2 x i64> %and7.i, i32 0 + // CHECK: %conv = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv + return _mm512_reduce_and_epi64(__W); +} + +int test_mm512_mask_reduce_add_epi64(__m512i __W, __mmask8 __M){ + // CHECK: %0 = bitcast i8 %__M to <8 x i1> + // CHECK: %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer + // CHECK: %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> + // CHECK: %add.i = add <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> + // CHECK: %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> + // CHECK: %add7.i = add <2 x i64> %shuffle6.i, %add4.i + // CHECK: %vecext.i = extractelement <2 x i64> %add7.i, i32 0 + // CHECK: %conv = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv + return _mm512_mask_reduce_add_epi64(__W , __M); +} + +int test_mm512_mask_reduce_mul_epi64(__m512i __W, __mmask8 __M){ + // CHECK: %0 = bitcast i8 %__M to <8 x i1> + // CHECK: %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> + // CHECK: %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 
x i32> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> + // CHECK: %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> + // CHECK: %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> + // CHECK: %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i + // CHECK: %vecext.i = extractelement <2 x i64> %mul7.i, i32 0 + // CHECK: %conv = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv + return _mm512_mask_reduce_mul_epi64(__W , __M); +} + +int test_mm512_mask_reduce_and_epi64(__m512i __W, __mmask8 __M){ + // CHECK: %0 = bitcast i8 %__M to <8 x i1> + // CHECK: %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> + // CHECK: %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> + // CHECK: %and.i = and <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> + // CHECK: %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> + // CHECK: %and7.i = and <2 x i64> %shuffle6.i, %and4.i + // CHECK: %vecext.i = extractelement <2 x i64> %and7.i, i32 0 + // CHECK: %conv = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv + return _mm512_mask_reduce_and_epi64(__W , __M); +} + +int test_mm512_mask_reduce_or_epi64(__m512i __W, __mmask8 __M){ + // CHECK: %0 = bitcast i8 %__M to <8 x i1> + // CHECK: %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer + // CHECK: %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %1, <8 
x i64> undef, <4 x i32> + // CHECK: %or.i = or <4 x i64> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> + // CHECK: %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> + // CHECK: %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> + // CHECK: %or7.i = or <2 x i64> %shuffle6.i, %or4.i + // CHECK: %vecext.i = extractelement <2 x i64> %or7.i, i32 0 + // CHECK: %conv = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv + return _mm512_mask_reduce_or_epi64(__W , __M); +} + +int test_mm512_reduce_add_epi32(__m512i __W){ + // CHECK: %0 = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> + // CHECK: %1 = bitcast <8 x i32> %shuffle.i to <4 x i64> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> + // CHECK: %2 = bitcast <8 x i32> %shuffle1.i to <4 x i64> + // CHECK: %add.i = add <4 x i64> %1, %2 + // CHECK: %3 = bitcast <4 x i64> %add.i to <8 x i32> + // CHECK: %shuffle2.i = shufflevector <8 x i32> %3, <8 x i32> undef, <4 x i32> + // CHECK: %4 = bitcast <4 x i32> %shuffle2.i to <2 x i64> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %3, <8 x i32> undef, <4 x i32> + // CHECK: %5 = bitcast <4 x i32> %shuffle3.i to <2 x i64> + // CHECK: %add4.i = add <2 x i64> %4, %5 + // CHECK: %6 = bitcast <2 x i64> %add4.i to <4 x i32> + // CHECK: %shuffle6.i = shufflevector <4 x i32> %6, <4 x i32> undef, <4 x i32> + // CHECK: %7 = bitcast <4 x i32> %shuffle6.i to <2 x i64> + // CHECK: %add7.i = add <2 x i64> %7, %add4.i + // CHECK: %8 = bitcast <2 x i64> %add7.i to <4 x i32> + // CHECK: %shuffle9.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> + // CHECK: %9 = bitcast <4 x i32> %shuffle9.i to <2 x i64> + // CHECK: %add10.i = add <2 x i64> %9, %add7.i + // CHECK: %vecext.i = extractelement <2 x i64> %add10.i, i32 0 
+ // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_add_epi32(__W); +} + +int test_mm512_reduce_mul_epi32(__m512i __W){ + // CHECK: %0 = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> + // CHECK: %1 = bitcast <8 x i32> %shuffle.i to <4 x i64> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> + // CHECK: %2 = bitcast <8 x i32> %shuffle1.i to <4 x i64> + // CHECK: %mul.i = mul <4 x i64> %1, %2 + // CHECK: %3 = bitcast <4 x i64> %mul.i to <8 x i32> + // CHECK: %shuffle2.i = shufflevector <8 x i32> %3, <8 x i32> undef, <4 x i32> + // CHECK: %4 = bitcast <4 x i32> %shuffle2.i to <2 x i64> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %3, <8 x i32> undef, <4 x i32> + // CHECK: %5 = bitcast <4 x i32> %shuffle3.i to <2 x i64> + // CHECK: %mul4.i = mul <2 x i64> %4, %5 + // CHECK: %6 = bitcast <2 x i64> %mul4.i to <4 x i32> + // CHECK: %shuffle6.i = shufflevector <4 x i32> %6, <4 x i32> undef, <4 x i32> + // CHECK: %7 = bitcast <4 x i32> %shuffle6.i to <2 x i64> + // CHECK: %mul7.i = mul <2 x i64> %7, %mul4.i + // CHECK: %8 = bitcast <2 x i64> %mul7.i to <4 x i32> + // CHECK: %shuffle9.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> + // CHECK: %9 = bitcast <4 x i32> %shuffle9.i to <2 x i64> + // CHECK: %mul10.i = mul <2 x i64> %9, %mul7.i + // CHECK: %vecext.i = extractelement <2 x i64> %mul10.i, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_mul_epi32(__W); +} + +int test_mm512_reduce_or_epi32(__m512i __W){ + // CHECK: %0 = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> + // CHECK: %or27.i = or <8 x i32> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x i32> %or27.i, <8 x i32> 
undef, <4 x i32> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %or27.i, <8 x i32> undef, <4 x i32> + // CHECK: %or428.i = or <4 x i32> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x i32> %or428.i, <4 x i32> undef, <4 x i32> + // CHECK: %or729.i = or <4 x i32> %shuffle6.i, %or428.i + // CHECK: %shuffle9.i = shufflevector <4 x i32> %or729.i, <4 x i32> undef, <4 x i32> + // CHECK: %or1030.i = or <4 x i32> %shuffle9.i, %or729.i + // CHECK: %or10.i = bitcast <4 x i32> %or1030.i to <2 x i64> + // CHECK: %vecext.i = extractelement <2 x i64> %or10.i, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_or_epi32(__W); +} + +int test_mm512_reduce_and_epi32(__m512i __W){ + // CHECK: %0 = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> + // CHECK: %and27.i = and <8 x i32> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x i32> %and27.i, <8 x i32> undef, <4 x i32> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %and27.i, <8 x i32> undef, <4 x i32> + // CHECK: %and428.i = and <4 x i32> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x i32> %and428.i, <4 x i32> undef, <4 x i32> + // CHECK: %and729.i = and <4 x i32> %shuffle6.i, %and428.i + // CHECK: %shuffle9.i = shufflevector <4 x i32> %and729.i, <4 x i32> undef, <4 x i32> + // CHECK: %and1030.i = and <4 x i32> %shuffle9.i, %and729.i + // CHECK: %and10.i = bitcast <4 x i32> %and1030.i to <2 x i64> + // CHECK: %vecext.i = extractelement <2 x i64> %and10.i, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_and_epi32(__W); +} + +int test_mm512_mask_reduce_add_epi32(__m512i __W, __mmask8 __M){ + // CHECK: %conv = zext i8 %__M to i16 + // CHECK: %0 = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %1 = 
bitcast i16 %conv to <16 x i1> + // CHECK: %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer + // CHECK: %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> + // CHECK: %3 = bitcast <8 x i32> %shuffle.i to <4 x i64> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> + // CHECK: %4 = bitcast <8 x i32> %shuffle1.i to <4 x i64> + // CHECK: %add.i = add <4 x i64> %3, %4 + // CHECK: %5 = bitcast <4 x i64> %add.i to <8 x i32> + // CHECK: %shuffle2.i = shufflevector <8 x i32> %5, <8 x i32> undef, <4 x i32> + // CHECK: %6 = bitcast <4 x i32> %shuffle2.i to <2 x i64> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %5, <8 x i32> undef, <4 x i32> + // CHECK: %7 = bitcast <4 x i32> %shuffle3.i to <2 x i64> + // CHECK: %add4.i = add <2 x i64> %6, %7 + // CHECK: %8 = bitcast <2 x i64> %add4.i to <4 x i32> + // CHECK: %shuffle6.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> + // CHECK: %9 = bitcast <4 x i32> %shuffle6.i to <2 x i64> + // CHECK: %add7.i = add <2 x i64> %9, %add4.i + // CHECK: %10 = bitcast <2 x i64> %add7.i to <4 x i32> + // CHECK: %shuffle9.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> + // CHECK: %11 = bitcast <4 x i32> %shuffle9.i to <2 x i64> + // CHECK: %add10.i = add <2 x i64> %11, %add7.i + // CHECK: %vecext.i = extractelement <2 x i64> %add10.i, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_add_epi32(__W , __M); +} + +int test_mm512_mask_reduce_mul_epi32(__m512i __W, __mmask8 __M){ + // CHECK: %conv = zext i8 %__M to i16 + // CHECK: %0 = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %1 = bitcast i16 %conv to <16 x i1> + // CHECK: %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> + // CHECK: %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> + // CHECK: %3 = bitcast <8 x i32> %shuffle.i to <4 x i64> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x 
i32> + // CHECK: %4 = bitcast <8 x i32> %shuffle1.i to <4 x i64> + // CHECK: %mul.i = mul <4 x i64> %3, %4 + // CHECK: %5 = bitcast <4 x i64> %mul.i to <8 x i32> + // CHECK: %shuffle2.i = shufflevector <8 x i32> %5, <8 x i32> undef, <4 x i32> + // CHECK: %6 = bitcast <4 x i32> %shuffle2.i to <2 x i64> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %5, <8 x i32> undef, <4 x i32> + // CHECK: %7 = bitcast <4 x i32> %shuffle3.i to <2 x i64> + // CHECK: %mul4.i = mul <2 x i64> %6, %7 + // CHECK: %8 = bitcast <2 x i64> %mul4.i to <4 x i32> + // CHECK: %shuffle6.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> + // CHECK: %9 = bitcast <4 x i32> %shuffle6.i to <2 x i64> + // CHECK: %mul7.i = mul <2 x i64> %9, %mul4.i + // CHECK: %10 = bitcast <2 x i64> %mul7.i to <4 x i32> + // CHECK: %shuffle9.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> + // CHECK: %11 = bitcast <4 x i32> %shuffle9.i to <2 x i64> + // CHECK: %mul10.i = mul <2 x i64> %11, %mul7.i + // CHECK: %vecext.i = extractelement <2 x i64> %mul10.i, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_mul_epi32(__W , __M); +} + +int test_mm512_mask_reduce_and_epi32(__m512i __W, __mmask8 __M){ + // CHECK: %conv = zext i8 %__M to i16 + // CHECK: %0 = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %1 = bitcast i16 %conv to <16 x i1> + // CHECK: %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> + // CHECK: %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> + // CHECK: %and28.i = and <8 x i32> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x i32> %and28.i, <8 x i32> undef, <4 x i32> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %and28.i, <8 x i32> undef, <4 x i32> + // CHECK: %and429.i = and <4 x i32> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x i32> %and429.i, <4 x i32> undef, <4 x 
i32> + // CHECK: %and730.i = and <4 x i32> %shuffle6.i, %and429.i + // CHECK: %shuffle9.i = shufflevector <4 x i32> %and730.i, <4 x i32> undef, <4 x i32> + // CHECK: %and1031.i = and <4 x i32> %shuffle9.i, %and730.i + // CHECK: %and10.i = bitcast <4 x i32> %and1031.i to <2 x i64> + // CHECK: %vecext.i = extractelement <2 x i64> %and10.i, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_and_epi32(__W , __M); +} + +int test_mm512_mask_reduce_or_epi32(__m512i __W, __mmask8 __M){ + // CHECK: %conv = zext i8 %__M to i16 + // CHECK: %0 = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %1 = bitcast i16 %conv to <16 x i1> + // CHECK: %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer + // CHECK: %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> + // CHECK: %or28.i = or <8 x i32> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x i32> %or28.i, <8 x i32> undef, <4 x i32> + // CHECK: %shuffle3.i = shufflevector <8 x i32> %or28.i, <8 x i32> undef, <4 x i32> + // CHECK: %or429.i = or <4 x i32> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <4 x i32> %or429.i, <4 x i32> undef, <4 x i32> + // CHECK: %or730.i = or <4 x i32> %shuffle6.i, %or429.i + // CHECK: %shuffle9.i = shufflevector <4 x i32> %or730.i, <4 x i32> undef, <4 x i32> + // CHECK: %or1031.i = or <4 x i32> %shuffle9.i, %or730.i + // CHECK: %or10.i = bitcast <4 x i32> %or1031.i to <2 x i64> + // CHECK: %vecext.i = extractelement <2 x i64> %or10.i, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_or_epi32(__W , __M); +} + +int test_mm512_reduce_add_pd(__m512d __W){ + // CHECK: %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> + // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> + // 
CHECK: %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> + // CHECK: %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> + // CHECK: %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> + // CHECK: %add7.i = fadd <2 x double> %add4.i, %shuffle6.i + // CHECK: %vecext.i = extractelement <2 x double> %add7.i, i32 0 + // CHECK: %conv.i = fptosi double %vecext.i to i64 + // CHECK: %conv = trunc i64 %conv.i to i32 + // CHECK: ret i32 %conv + return _mm512_reduce_add_pd(__W); +} + +int test_mm512_reduce_mul_pd(__m512d __W){ + // CHECK: %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> + // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> + // CHECK: %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> + // CHECK: %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> + // CHECK: %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> + // CHECK: %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i + // CHECK: %vecext.i = extractelement <2 x double> %mul7.i, i32 0 + // CHECK: %conv.i = fptosi double %vecext.i to i64 + // CHECK: %conv = trunc i64 %conv.i to i32 + // CHECK: ret i32 %conv + return _mm512_reduce_mul_pd(__W); +} + +int test_mm512_reduce_add_ps(__m512 __W){ + // CHECK: %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> + // CHECK: %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i + // CHECK: %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> + // CHECK: 
%shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32>
+  // CHECK: %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32>
+  // CHECK: %add7.i = fadd <4 x float> %add4.i, %shuffle6.i
+  // CHECK: %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32>
+  // CHECK: %add10.i = fadd <4 x float> %add7.i, %shuffle9.i
+  // CHECK: %vecext.i = extractelement <4 x float> %add10.i, i32 0
+  // CHECK: %conv.i = fptosi float %vecext.i to i32
+  // CHECK: ret i32 %conv.i
+  return _mm512_reduce_add_ps(__W);
+}
+
+// Unmasked float multiply-reduction of a 16-lane vector. The expected IR
+// multiplies two 8-lane shuffles, then two 4-lane shuffles, then performs two
+// further shuffle+fmul steps on the 4-lane result and extracts element 0.
+// NOTE(review): the patterns pin an fptosi of the reduced value, i.e. the
+// floating-point product is converted to i32 before being returned. Intel's
+// guide declares this intrinsic as returning float - confirm the return type
+// in the header.
+int test_mm512_reduce_mul_ps(__m512 __W){
+  // CHECK: %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32>
+  // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32>
+  // CHECK: %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i
+  // CHECK: %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32>
+  // CHECK: %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32>
+  // CHECK: %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32>
+  // CHECK: %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i
+  // CHECK: %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32>
+  // CHECK: %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i
+  // CHECK: %vecext.i = extractelement <4 x float> %mul10.i, i32 0
+  // CHECK: %conv.i = fptosi float %vecext.i to i32
+  // CHECK: ret i32 %conv.i
+  return _mm512_reduce_mul_ps(__W);
+}
+
+// Masked double add-reduction of an 8-lane vector. The 8-bit mask is bitcast
+// to <8 x i1> and a select substitutes zeroinitializer for the lanes whose
+// mask bit is clear before the halving fadd steps run.
+// NOTE(review): the patterns pin fptosi to i64 followed by a trunc to i32, so
+// the double sum is returned as a truncated integer - confirm the intended
+// return type against the header.
+int test_mm512_mask_reduce_add_pd(__m512d __W, __mmask8 __M){
+  // CHECK: %0 = bitcast i8 %__M to <8 x i1>
+  // CHECK: %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer
+  // CHECK: %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32>
+  // CHECK: %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32>
+  // CHECK: %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
+  // CHECK: %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32>
+  // CHECK: %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32>
+  // CHECK: %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32>
+  // CHECK: %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
+  // CHECK: %vecext.i = extractelement <2 x double> %add7.i, i32 0
+  // CHECK: %conv.i = fptosi double %vecext.i to i64
+  // CHECK: %conv = trunc i64 %conv.i to i32
+  // CHECK: ret i32 %conv
+  return _mm512_mask_reduce_add_pd(__W , __M);
+}
+
+// Masked double multiply-reduction of an 8-lane vector. Same shape as the
+// masked add test, with fmul in place of fadd. The select pattern only pins
+// the type of the substituted operand, not its value (presumably the
+// multiplicative identity - verify against the header's macro).
+int test_mm512_mask_reduce_mul_pd(__m512d __W, __mmask8 __M){
+  // CHECK: %0 = bitcast i8 %__M to <8 x i1>
+  // CHECK: %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double>
+  // CHECK: %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32>
+  // CHECK: %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32>
+  // CHECK: %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
+  // CHECK: %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32>
+  // CHECK: %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32>
+  // CHECK: %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32>
+  // CHECK: %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
+  // CHECK: %vecext.i = extractelement <2 x double> %mul7.i, i32 0
+  // CHECK: %conv.i = fptosi double %vecext.i to i64
+  // CHECK: %conv = trunc i64 %conv.i to i32
+  // CHECK: ret i32 %conv
+  return _mm512_mask_reduce_mul_pd(__W , __M);
+}
+
+// Masked float add-reduction of a 16-lane vector. The mask is declared as
+// __mmask16 so that every one of the 16 float lanes has its own mask bit; with
+// an 8-bit mask the value was zero-extended at the call and lanes 8..15 could
+// never be selected. The mask now reaches the bitcast to <16 x i1> directly,
+// and the select substitutes zeroinitializer for lanes whose bit is clear.
+// NOTE(review): the patterns pin an fptosi of the reduced value to i32 -
+// confirm the intended return type against the header.
+int test_mm512_mask_reduce_add_ps(__m512 __W, __mmask16 __M){
+  // CHECK: %0 = bitcast i16 %__M to <16 x i1>
+  // CHECK: %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
+  // CHECK: %shuffle.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32>
+  // CHECK: %shuffle1.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32>
+  // CHECK: %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i
+  // CHECK: %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32>
+  // CHECK: %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32>
+  // CHECK: %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32>
+  // CHECK: %add7.i = fadd <4 x float> %add4.i, %shuffle6.i
+  // CHECK: %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32>
+  // CHECK: %add10.i = fadd <4 x float> %add7.i, %shuffle9.i
+  // CHECK: %vecext.i = extractelement <4 x float> %add10.i, i32 0
+  // CHECK: %conv.i = fptosi float %vecext.i to i32
+  // CHECK: ret i32 %conv.i
+  return _mm512_mask_reduce_add_ps(__W , __M);
+}
+
+// Masked float multiply-reduction of a 16-lane vector. As with the masked add
+// test, the mask is __mmask16 so all 16 lanes are individually maskable and no
+// widening of the mask is expected in the IR. The select pattern only pins the
+// type of the substituted operand, not its value (presumably the
+// multiplicative identity - verify against the header's macro).
+int test_mm512_mask_reduce_mul_ps(__m512 __W, __mmask16 __M){
+  // CHECK: %0 = bitcast i16 %__M to <16 x i1>
+  // CHECK: %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float>
+  // CHECK: %shuffle.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32>
+  // CHECK: %shuffle1.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32>
+  // CHECK: %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i
+  // CHECK: %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32>
+  // CHECK: %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32>
+  // CHECK: %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i
+  // CHECK: %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32>
+  // CHECK: %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i
+  // CHECK: %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32>
+  // CHECK: %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i
+  // CHECK: %vecext.i = extractelement <4 x float> %mul10.i, i32 0
+  // CHECK: %conv.i = fptosi float %vecext.i to i32
+  // CHECK: ret i32 %conv.i
+  return _mm512_mask_reduce_mul_ps(__W , __M);
+}