Index: cfe/trunk/lib/Headers/avx512fintrin.h =================================================================== --- cfe/trunk/lib/Headers/avx512fintrin.h +++ cfe/trunk/lib/Headers/avx512fintrin.h @@ -9904,6 +9904,286 @@ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps); } +// Used bisection method. At each step, we partition the vector with previous +// step in half, and the operation is performed on its two halves. +// This takes log2(n) steps where n is the number of elements in the vector. +// This macro uses only intrinsics from the AVX512F feature. + +// Vec512 - Vector with size of 512. +// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example: +// __mm512_max_epi64 +// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}] +// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}] + +#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 0, 1, 2, 3, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 4, 5, 6, 7, -1, -1, -1, -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 0, 1, -1, -1, -1, -1, -1, -1),\ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 2, 3, -1, -1, -1, -1, -1, \ + -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 0, -1, -1, -1, -1, -1, -1, -1),\ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 1, -1, -1, -1, -1, -1, -1, -1))\ + ; \ + return Vec512[0]; \ + }) + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_reduce_max_epi64(__m512i __V) { + _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i); +} + +static __inline__ unsigned long long 
__DEFAULT_FN_ATTRS +_mm512_reduce_max_epu64(__m512i __V) { + _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_reduce_max_pd(__m512d __V) { + _mm512_reduce_maxMin_64bit(__V, max_pd, d, f); +} + +static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64 +(__m512i __V) { + _mm512_reduce_maxMin_64bit(__V, min_epi64, i, i); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mm512_reduce_min_epu64(__m512i __V) { + _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_reduce_min_pd(__m512d __V) { + _mm512_reduce_maxMin_64bit(__V, min_pd, d, f); +} + +// Vec512 - Vector with size 512. +// Vec512Neutral - A 512 length vector with elements set to the identity element +// Identity element: {max_epi,0x8000000000000000} +// {max_epu,0x0000000000000000} +// {max_pd, 0xFFF0000000000000} +// {min_epi,0x7FFFFFFFFFFFFFFF} +// {min_epu,0xFFFFFFFFFFFFFFFF} +// {min_pd, 0x7FF0000000000000} +// +// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example: +// __mm512_max_epi64 +// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}] +// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}] +// T3 - Can get 'q' q word and 'pd' for packed double. 
+// [__builtin_ia32_select{q|pd}_512] + // Mask - Intrinsic Mask + +#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \ + T2, T3, Mask) \ + __extension__({ \ + Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \ + (__mmask8)Mask, \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512Neutral); \ + _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2); \ + }) + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000), + max_epi64, i, i, q, __M); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000), + max_epu64, i, i, q, __M); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(-__builtin_inf()), + max_pd, d, f, pd, __M); +} + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), + min_epi64, i, i, q, __M); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), + min_epu64, i, i, q, __M); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()), + min_pd, d, f, pd, __M); +} + +// Vec512 - Vector with size 512.
+// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example: +// __mm512_max_epi32 +// T1 - Can get 'i' for int and ' ' .[__m512{i|}] +// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}] + +#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, 1, 2, 3, 4, 5, 6, 7, \ + -1, -1, -1, -1, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 8, 9, 10, 11, 12, 13, 14, 15, \ + -1, -1, -1, -1, -1, -1, -1, -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, 1, 2, 3, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 4, 5, 6, 7, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, 1, -1, -1, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 2, 3, -1, -1, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, -1, -1, -1, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 1, -1, -1, -1, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1)); \ + return Vec512[0]; \ + }) + +static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i a) { + _mm512_reduce_maxMin_32bit(a, max_epi32, i, i); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm512_reduce_max_epu32(__m512i a) { + _mm512_reduce_maxMin_32bit(a, 
max_epu32, i, i); +} + +static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 a) { + _mm512_reduce_maxMin_32bit(a, max_ps, , f); +} + +static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i a) { + _mm512_reduce_maxMin_32bit(a, min_epi32, i, i); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm512_reduce_min_epu32(__m512i a) { + _mm512_reduce_maxMin_32bit(a, min_epu32, i, i); +} + +static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 a) { + _mm512_reduce_maxMin_32bit(a, min_ps, , f); +} + +// Vec512 - Vector with size 512. +// Vec512Neutral - A 512 length vector with elements set to the identity element +// Identity element: {max_epi,0x80000000} +// {max_epu,0x00000000} +// {max_ps, 0xFF800000} +// {min_epi,0x7FFFFFFF} +// {min_epu,0xFFFFFFFF} +// {min_ps, 0x7F800000} +// +// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example: +// __mm512_max_epi32 +// T1 - Can get 'i' for int and ' ' .[__m512{i|}] +// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}] +// T3 - Can get 'd' for dword and 'ps' for packed single.
+// [__builtin_ia32_select{d|ps}_512] + // Mask - Intrinsic Mask + +#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \ + T2, T3, Mask) \ + __extension__({ \ + Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \ + (__mmask16)Mask, \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512Neutral); \ + _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2); \ + }) + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32, + i, i, d, __M); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32, + i, i, d, __M); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(-__builtin_inff()), max_ps, , f, + ps, __M); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32, + i, i, d, __M); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32, + i, i, d, __M); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()), min_ps, , f, + ps, __M); +} + +#undef __DEFAULT_FN_ATTRS + #endif // __AVX512FINTRIN_H Index: cfe/trunk/test/CodeGen/avx512-reduceMinMaxIntrin.c =================================================================== --- cfe/trunk/test/CodeGen/avx512-reduceMinMaxIntrin.c +++ cfe/trunk/test/CodeGen/avx512-reduceMinMaxIntrin.c @@ -0,0 +1,437 @@ +// RUN: %clang_cc1 -ffreestanding %s -O2 -triple=x86_64-apple-darwin
-target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror |opt -instnamer -S |FileCheck %s + +#include <immintrin.h> + +long long test_mm512_reduce_max_epi64(__m512i __W){ + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> + // CHECK: %tmp = icmp slt <8 x i64> %shuffle1.i, %__W + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> + // CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle3.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> + // CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle6.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0 + // CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0 + // CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_reduce_max_epi64(__W); +} + +unsigned long long test_mm512_reduce_max_epu64(__m512i __W){ + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> + // CHECK: %tmp = icmp ult <8 x i64> %shuffle1.i, %__W + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> + // CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle3.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> + // CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle6.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0 + // CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0 + // CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i +
// CHECK: ret i64 %vecext.i + return _mm512_reduce_max_epu64(__W); +} + +double test_mm512_reduce_max_pd(__m512d __W){ + // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <8 x i32> + // CHECK: %tmp = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %__W, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle3.i = shufflevector <8 x double> %tmp, <8 x double> undef, <8 x i32> + // CHECK: %tmp1 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp, <8 x double> %shuffle3.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle6.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> + // CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp1, <8 x double> %shuffle6.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <8 x double> %tmp2, i32 0 + // CHECK: ret double %vecext.i + return _mm512_reduce_max_pd(__W); +} + +long long test_mm512_reduce_min_epi64(__m512i __W){ + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> + // CHECK: %tmp = icmp sgt <8 x i64> %shuffle1.i, %__W + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> + // CHECK: %tmp2 = icmp slt <8 x i64> %tmp1, %shuffle3.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> + // CHECK: %tmp4 = icmp slt <8 x i64> %tmp3, %shuffle6.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0 + // CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0 + // CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i + // CHECK: ret i64 %vecext.i + return 
_mm512_reduce_min_epi64(__W); +} + +unsigned long long test_mm512_reduce_min_epu64(__m512i __W){ + // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> + // CHECK: %tmp = icmp ugt <8 x i64> %shuffle1.i, %__W + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> + // CHECK: %tmp2 = icmp ult <8 x i64> %tmp1, %shuffle3.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> + // CHECK: %tmp4 = icmp ult <8 x i64> %tmp3, %shuffle6.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0 + // CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0 + // CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_reduce_min_epu64(__W); +} + +double test_mm512_reduce_min_pd(__m512d __W){ + // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <8 x i32> + // CHECK: %tmp = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %__W, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle3.i = shufflevector <8 x double> %tmp, <8 x double> undef, <8 x i32> + // CHECK: %tmp1 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp, <8 x double> %shuffle3.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle6.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> + // CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp1, <8 x double> %shuffle6.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <8 x double> %tmp2, i32 0 + // CHECK: ret double %vecext.i + return _mm512_reduce_min_pd(__W); +} + +long long 
test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){ + // CHECK: %tmp = bitcast i8 %__M to <8 x i1> + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> + // CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle1.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> + // CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle4.i + // CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> + // CHECK: %tmp6 = icmp sgt <8 x i64> %tmp5, %shuffle7.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0 + // CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0 + // CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_mask_reduce_max_epi64(__M, __W); +} + +unsigned long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){ + // CHECK: %tmp = bitcast i8 %__M to <8 x i1> + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> zeroinitializer + // CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> + // CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle1.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> + // CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle4.i + // CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> + // CHECK: %tmp6 = icmp ugt <8 x i64> %tmp5, %shuffle7.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0 + // CHECK: %.elt22.i = 
extractelement <8 x i64> %tmp5, i32 0 + // CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_mask_reduce_max_epu64(__M, __W); +} + +long long test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){ + // CHECK: %tmp = bitcast i8 %__M to <8 x i1> + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x double> %__W, <8 x double> + // CHECK: %shuffle1.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> + // CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp1, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle4.i = shufflevector <8 x double> %tmp2, <8 x double> undef, <8 x i32> + // CHECK: %tmp3 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp2, <8 x double> %shuffle4.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle7.i = shufflevector <8 x double> %tmp3, <8 x double> undef, <8 x i32> + // CHECK: %tmp4 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp3, <8 x double> %shuffle7.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <8 x double> %tmp4, i32 0 + // CHECK: %conv = fptosi double %vecext.i to i64 + // CHECK: ret i64 %conv + return _mm512_mask_reduce_max_pd(__M, __W); +} + +long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){ + // CHECK: %tmp = bitcast i8 %__M to <8 x i1> + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> + // CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> + // CHECK: %tmp2 = icmp slt <8 x i64> %tmp1, %shuffle1.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> + // CHECK: %tmp4 = icmp slt <8 x i64> %tmp3, %shuffle4.i + // CHECK: %tmp5 
= select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> + // CHECK: %tmp6 = icmp slt <8 x i64> %tmp5, %shuffle7.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0 + // CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0 + // CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_mask_reduce_min_epi64(__M, __W); +} + +long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){ + // CHECK: %tmp = bitcast i8 %__M to <8 x i1> + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> zeroinitializer + // CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> + // CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle1.i + // CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> + // CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle4.i + // CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> + // CHECK: %tmp6 = icmp ugt <8 x i64> %tmp5, %shuffle7.i + // CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0 + // CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0 + // CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1 + // CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i + // CHECK: ret i64 %vecext.i + return _mm512_mask_reduce_max_epu64(__M, __W); +} + +double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){ + // CHECK: %tmp = bitcast i8 %__M to <8 x i1> + // CHECK: %tmp1 = select <8 x i1> %tmp, <8 x double> %__W, <8 x double> + // CHECK: %shuffle1.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> + // CHECK: %tmp2 = 
tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp1, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle4.i = shufflevector <8 x double> %tmp2, <8 x double> undef, <8 x i32> + // CHECK: %tmp3 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp2, <8 x double> %shuffle4.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %shuffle7.i = shufflevector <8 x double> %tmp3, <8 x double> undef, <8 x i32> + // CHECK: %tmp4 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp3, <8 x double> %shuffle7.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <8 x double> %tmp4, i32 0 + // CHECK: ret double %vecext.i + return _mm512_mask_reduce_min_pd(__M, __W); +} + +int test_mm512_reduce_max_epi32(__m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> + // CHECK: %tmp1 = icmp sgt <16 x i32> %tmp, %shuffle1.i + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> + // CHECK: %tmp3 = icmp sgt <16 x i32> %tmp2, %shuffle3.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> + // CHECK: %tmp5 = icmp sgt <16 x i32> %tmp4, %shuffle6.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i + // CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> + // CHECK: %tmp7 = icmp sgt <16 x i32> %tmp6, %shuffle9.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i + // CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0 + // CHECK: %conv.i = trunc i64 
%vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_max_epi32(__W); +} + +unsigned int test_mm512_reduce_max_epu32(__m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> + // CHECK: %tmp1 = icmp ugt <16 x i32> %tmp, %shuffle1.i + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> + // CHECK: %tmp3 = icmp ugt <16 x i32> %tmp2, %shuffle3.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> + // CHECK: %tmp5 = icmp ugt <16 x i32> %tmp4, %shuffle6.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i + // CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> + // CHECK: %tmp7 = icmp ugt <16 x i32> %tmp6, %shuffle9.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i + // CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_max_epu32(__W); +} + +float test_mm512_reduce_max_ps(__m512 __W){ + // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <16 x i32> + // CHECK: %tmp = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %__W, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle3.i = shufflevector <16 x float> %tmp, <16 x float> undef, <16 x i32> + // CHECK: %tmp1 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp, <16 x float> %shuffle3.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle6.i = shufflevector <16 x float> %tmp1, <16 x float> 
undef, <16 x i32> + // CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp1, <16 x float> %shuffle6.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle9.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> + // CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp2, <16 x float> %shuffle9.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <16 x float> %tmp3, i32 0 + // CHECK: ret float %vecext.i + return _mm512_reduce_max_ps(__W); +} + +int test_mm512_reduce_min_epi32(__m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> + // CHECK: %tmp1 = icmp slt <16 x i32> %tmp, %shuffle1.i + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> + // CHECK: %tmp3 = icmp slt <16 x i32> %tmp2, %shuffle3.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> + // CHECK: %tmp5 = icmp slt <16 x i32> %tmp4, %shuffle6.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i + // CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> + // CHECK: %tmp7 = icmp slt <16 x i32> %tmp6, %shuffle9.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i + // CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_min_epi32(__W); +} + +unsigned int test_mm512_reduce_min_epu32(__m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %shuffle1.i = 
shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> + // CHECK: %tmp1 = icmp ult <16 x i32> %tmp, %shuffle1.i + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i + // CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> + // CHECK: %tmp3 = icmp ult <16 x i32> %tmp2, %shuffle3.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i + // CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> + // CHECK: %tmp5 = icmp ult <16 x i32> %tmp4, %shuffle6.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i + // CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> + // CHECK: %tmp7 = icmp ult <16 x i32> %tmp6, %shuffle9.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i + // CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_reduce_min_epu32(__W); +} + +float test_mm512_reduce_min_ps(__m512 __W){ + // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <16 x i32> + // CHECK: %tmp = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %__W, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle3.i = shufflevector <16 x float> %tmp, <16 x float> undef, <16 x i32> + // CHECK: %tmp1 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp, <16 x float> %shuffle3.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle6.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> + // CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp1, <16 x float> %shuffle6.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle9.i = shufflevector 
<16 x float> %tmp2, <16 x float> undef, <16 x i32> + // CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp2, <16 x float> %shuffle9.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <16 x float> %tmp3, i32 0 + // CHECK: ret float %vecext.i + return _mm512_reduce_min_ps(__W); +} + +int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %tmp1 = bitcast i16 %__M to <16 x i1> + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> + // CHECK: %tmp3 = icmp sgt <16 x i32> %tmp2, %shuffle1.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> + // CHECK: %tmp5 = icmp sgt <16 x i32> %tmp4, %shuffle4.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> + // CHECK: %tmp7 = icmp sgt <16 x i32> %tmp6, %shuffle7.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i + // CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> + // CHECK: %tmp9 = icmp sgt <16 x i32> %tmp8, %shuffle10.i + // CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i + // CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_max_epi32(__M, __W); +} + +unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %tmp1 = bitcast i16 %__M to <16 x i1> + // CHECK: %tmp2 = select <16 
x i1> %tmp1, <16 x i32> %tmp, <16 x i32> zeroinitializer + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> + // CHECK: %tmp3 = icmp ugt <16 x i32> %tmp2, %shuffle1.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> + // CHECK: %tmp5 = icmp ugt <16 x i32> %tmp4, %shuffle4.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> + // CHECK: %tmp7 = icmp ugt <16 x i32> %tmp6, %shuffle7.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i + // CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> + // CHECK: %tmp9 = icmp ugt <16 x i32> %tmp8, %shuffle10.i + // CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i + // CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_max_epu32(__M, __W); +} + +float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){ + // CHECK: %tmp = bitcast i16 %__M to <16 x i1> + // CHECK: %tmp1 = select <16 x i1> %tmp, <16 x float> %__W, <16 x float> + // CHECK: %shuffle1.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> + // CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp1, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle4.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> + // CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp2, <16 x float> %shuffle4.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle7.i = shufflevector <16 x 
float> %tmp3, <16 x float> undef, <16 x i32> + // CHECK: %tmp4 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp3, <16 x float> %shuffle7.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle10.i = shufflevector <16 x float> %tmp4, <16 x float> undef, <16 x i32> + // CHECK: %tmp5 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp4, <16 x float> %shuffle10.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <16 x float> %tmp5, i32 0 + // CHECK: ret float %vecext.i + return _mm512_mask_reduce_max_ps(__M, __W); +} + +int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %tmp1 = bitcast i16 %__M to <16 x i1> + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> + // CHECK: %tmp3 = icmp slt <16 x i32> %tmp2, %shuffle1.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> + // CHECK: %tmp5 = icmp slt <16 x i32> %tmp4, %shuffle4.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> + // CHECK: %tmp7 = icmp slt <16 x i32> %tmp6, %shuffle7.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i + // CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> + // CHECK: %tmp9 = icmp slt <16 x i32> %tmp8, %shuffle10.i + // CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i + // CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i 
+ return _mm512_mask_reduce_min_epi32(__M, __W); +} + +unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){ + // CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32> + // CHECK: %tmp1 = bitcast i16 %__M to <16 x i1> + // CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> + // CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> + // CHECK: %tmp3 = icmp ult <16 x i32> %tmp2, %shuffle1.i + // CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i + // CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> + // CHECK: %tmp5 = icmp ult <16 x i32> %tmp4, %shuffle4.i + // CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i + // CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> + // CHECK: %tmp7 = icmp ult <16 x i32> %tmp6, %shuffle7.i + // CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i + // CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> + // CHECK: %tmp9 = icmp ult <16 x i32> %tmp8, %shuffle10.i + // CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i + // CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64> + // CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0 + // CHECK: %conv.i = trunc i64 %vecext.i to i32 + // CHECK: ret i32 %conv.i + return _mm512_mask_reduce_min_epu32(__M, __W); +} + +float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){ + // CHECK: %tmp = bitcast i16 %__M to <16 x i1> + // CHECK: %tmp1 = select <16 x i1> %tmp, <16 x float> %__W, <16 x float> + // CHECK: %shuffle1.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> + // CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp1, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle4.i = shufflevector <16 x float> %tmp2, 
<16 x float> undef, <16 x i32> + // CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp2, <16 x float> %shuffle4.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle7.i = shufflevector <16 x float> %tmp3, <16 x float> undef, <16 x i32> + // CHECK: %tmp4 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp3, <16 x float> %shuffle7.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %shuffle10.i = shufflevector <16 x float> %tmp4, <16 x float> undef, <16 x i32> + // CHECK: %tmp5 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp4, <16 x float> %shuffle10.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3 + // CHECK: %vecext.i = extractelement <16 x float> %tmp5, i32 0 + // CHECK: ret float %vecext.i + return _mm512_mask_reduce_min_ps(__M, __W); +} +