diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1878,6 +1878,10 @@
 TARGET_BUILTIN(__builtin_ia32_reduce_and_q512, "OiV8Oi", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_fadd_pd512, "ddV8d", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ps512, "ffV16f", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fmax_pd512, "dV8d", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ps512, "fV16f", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fmin_pd512, "dV8d", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ps512, "fV16f", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_fmul_pd512, "ddV8d", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ps512, "ffV16f", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_mul_d512, "iV16i", "ncV:512:", "avx512f")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13826,16 +13826,43 @@
   case X86::BI__builtin_ia32_reduce_fadd_ps512: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
-    Builder.getFastMathFlags().setAllowReassoc(true);
+    // Restore the builder's fast-math flags on exit so that 'reassoc' is
+    // attached to this reduction only, not to later FP instructions.
+    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+    Builder.getFastMathFlags().setAllowReassoc();
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
   case X86::BI__builtin_ia32_reduce_fmul_pd512:
   case X86::BI__builtin_ia32_reduce_fmul_ps512: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
-    Builder.getFastMathFlags().setAllowReassoc(true);
+    // Keep 'reassoc' local to this reduction.
+    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+    Builder.getFastMathFlags().setAllowReassoc();
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
+  case X86::BI__builtin_ia32_reduce_fmax_pd512:
+  case X86::BI__builtin_ia32_reduce_fmax_ps512: {
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType());
+    // Keep 'nnan'/'nsz' local to this reduction.
+    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+    FastMathFlags &FMF = Builder.getFastMathFlags();
+    FMF.setNoNaNs();
+    FMF.setNoSignedZeros();
+    return Builder.CreateCall(F, {Ops[0]});
+  }
+  case X86::BI__builtin_ia32_reduce_fmin_pd512:
+  case X86::BI__builtin_ia32_reduce_fmin_ps512: {
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType());
+    // Keep 'nnan'/'nsz' local to this reduction.
+    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+    FastMathFlags &FMF = Builder.getFastMathFlags();
+    FMF.setNoNaNs();
+    FMF.setNoSignedZeros();
+    return Builder.CreateCall(F, {Ops[0]});
+  }
   case X86::BI__builtin_ia32_reduce_mul_d512:
   case X86::BI__builtin_ia32_reduce_mul_q512: {
     Function *F =
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -9300,8 +9300,10 @@
  * computations. In vector-reduction arithmetic, the evaluation order is
  * independent of the order of the input elements of V.
 
- * For floating point types, we always assume the elements are reassociable even
- * if -fast-math is off.
+ * For floating-point intrinsics, we have implicit assumptions:
+ * 1. The elements are reassociable when using fadd/fmul intrinsics;
+ * 2. There are no NaNs or signed zeros in the elements when using
+ *    fmin/fmax intrinsics.
 
  * Used bisection method. At each step, we partition the vector with previous
  * step in half, and the operation is performed on its two halves.
@@ -9524,75 +9526,49 @@ return __builtin_ia32_reduce_umin_d512((__v16si)__V); } -#define _mm512_mask_reduce_operator(op) \ - __m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \ - __m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \ - __m256d __t3 = _mm256_##op(__t1, __t2); \ - __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \ - __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \ - __m128d __t6 = _mm_##op(__t4, __t5); \ - __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ - __m128d __t8 = _mm_##op(__t6, __t7); \ - return __t8[0] - static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_max_pd(__m512d __V) { - _mm512_mask_reduce_operator(max_pd); + return __builtin_ia32_reduce_fmax_pd512(__V); } static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_min_pd(__m512d __V) { - _mm512_mask_reduce_operator(min_pd); + return __builtin_ia32_reduce_fmin_pd512(__V); } static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) { __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V); - _mm512_mask_reduce_operator(max_pd); + return __builtin_ia32_reduce_fmax_pd512(__V); } static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) { __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V); - _mm512_mask_reduce_operator(min_pd); -} -#undef _mm512_mask_reduce_operator - -#define _mm512_mask_reduce_operator(op) \ - __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 0); \ - __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 1); \ - __m256 __t3 = _mm256_##op(__t1, __t2); \ - __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \ - __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \ - __m128 __t6 = _mm_##op(__t4, __t5); \ - __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ - __m128 __t8 = _mm_##op(__t6, __t7); \ - __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ - __m128 __t10 = _mm_##op(__t8, __t9); \ - return __t10[0] + return 
__builtin_ia32_reduce_fmin_pd512(__V); +} static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_reduce_max_ps(__m512 __V) { - _mm512_mask_reduce_operator(max_ps); + return __builtin_ia32_reduce_fmax_ps512(__V); } static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_reduce_min_ps(__m512 __V) { - _mm512_mask_reduce_operator(min_ps); + return __builtin_ia32_reduce_fmin_ps512(__V); } static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) { __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V); - _mm512_mask_reduce_operator(max_ps); + return __builtin_ia32_reduce_fmax_ps512(__V); } static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) { __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V); - _mm512_mask_reduce_operator(min_ps); + return __builtin_ia32_reduce_fmin_ps512(__V); } -#undef _mm512_mask_reduce_operator /// Moves the least significant 32 bits of a vector of [16 x i32] to a /// 32-bit signed integer value. 
diff --git a/clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c b/clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c --- a/clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c +++ b/clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c @@ -14,18 +14,14 @@ return _mm512_reduce_max_epu64(__W); } -double test_mm512_reduce_max_pd(__m512d __W){ - // CHECK-LABEL: @test_mm512_reduce_max_pd( - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> - // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> - // CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> - // CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: extractelement <2 x double> %{{.*}}, i32 0 - return _mm512_reduce_max_pd(__W); +double test_mm512_reduce_max_pd(__m512d __W, double ExtraAddOp){ +// CHECK-LABEL: @test_mm512_reduce_max_pd( +// CHECK-NOT: nnan +// CHECK-NOT: nsz +// CHECK: call nnan nsz double @llvm.vector.reduce.fmax.v8f64(<8 x double> %{{.*}}) +// CHECK-NOT: nnan +// CHECK-NOT: nsz + return _mm512_reduce_max_pd(__W) + ExtraAddOp; } long long test_mm512_reduce_min_epi64(__m512i __W){ @@ -40,18 +36,14 @@ return _mm512_reduce_min_epu64(__W); } -double test_mm512_reduce_min_pd(__m512d __W){ - // CHECK-LABEL: @test_mm512_reduce_min_pd( - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <4 x double> %{{.*}}, <4 x 
double> poison, <2 x i32> - // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> - // CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> - // CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: extractelement <2 x double> %{{.*}}, i32 0 - return _mm512_reduce_min_pd(__W); +double test_mm512_reduce_min_pd(__m512d __W, double ExtraMulOp){ +// CHECK-LABEL: @test_mm512_reduce_min_pd( +// CHECK-NOT: nnan +// CHECK-NOT: nsz +// CHECK: call nnan nsz double @llvm.vector.reduce.fmin.v8f64(<8 x double> %{{.*}}) +// CHECK-NOT: nnan +// CHECK-NOT: nsz + return _mm512_reduce_min_pd(__W) * ExtraMulOp; } long long test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){ @@ -59,7 +51,7 @@ // CHECK: bitcast i8 %{{.*}} to <8 x i1> // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} // CHECK: call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %{{.*}}) - return _mm512_mask_reduce_max_epi64(__M, __W); + return _mm512_mask_reduce_max_epi64(__M, __W); } unsigned long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){ @@ -67,23 +59,15 @@ // CHECK: bitcast i8 %{{.*}} to <8 x i1> // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} // CHECK: call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %{{.*}}) - return _mm512_mask_reduce_max_epu64(__M, __W); + return _mm512_mask_reduce_max_epu64(__M, __W); } double test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){ - // CHECK-LABEL: @test_mm512_mask_reduce_max_pd( - // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: call <4 x double> @llvm.x86.avx.max.pd.256(<4 x 
double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> - // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> - // CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> - // CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: extractelement <2 x double> %{{.*}}, i32 0 - return _mm512_mask_reduce_max_pd(__M, __W); +// CHECK-LABEL: @test_mm512_mask_reduce_max_pd( +// CHECK: bitcast i8 %{{.*}} to <8 x i1> +// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} +// CHECK: call nnan nsz double @llvm.vector.reduce.fmax.v8f64(<8 x double> %{{.*}}) + return _mm512_mask_reduce_max_pd(__M, __W); } long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){ @@ -91,7 +75,7 @@ // CHECK: bitcast i8 %{{.*}} to <8 x i1> // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} // CHECK: call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %{{.*}}) - return _mm512_mask_reduce_min_epi64(__M, __W); + return _mm512_mask_reduce_min_epi64(__M, __W); } unsigned long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){ @@ -99,23 +83,15 @@ // CHECK: bitcast i8 %{{.*}} to <8 x i1> // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} // CHECK: call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %{{.*}}) - return _mm512_mask_reduce_min_epu64(__M, __W); + return _mm512_mask_reduce_min_epu64(__M, __W); } double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){ - // CHECK-LABEL: @test_mm512_mask_reduce_min_pd( - // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x 
double> poison, <4 x i32> - // CHECK: call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> - // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> - // CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> - // CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: extractelement <2 x double> %{{.*}}, i32 0 - return _mm512_mask_reduce_min_pd(__M, __W); +// CHECK-LABEL: @test_mm512_mask_reduce_min_pd( +// CHECK: bitcast i8 %{{.*}} to <8 x i1> +// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} +// CHECK: call nnan nsz double @llvm.vector.reduce.fmin.v8f64(<8 x double> %{{.*}}) + return _mm512_mask_reduce_min_pd(__M, __W); } int test_mm512_reduce_max_epi32(__m512i __W){ @@ -131,19 +107,9 @@ } float test_mm512_reduce_max_ps(__m512 __W){ - // CHECK-LABEL: define{{.*}} float @test_mm512_reduce_max_ps( - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}) - // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> - // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> - // CHECK: call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> - // CHECK: call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> - // CHECK: call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: 
extractelement <4 x float> %{{.*}}, i32 0 - return _mm512_reduce_max_ps(__W); +// CHECK-LABEL: @test_mm512_reduce_max_ps( +// CHECK: call nnan nsz float @llvm.vector.reduce.fmax.v16f32(<16 x float> %{{.*}}) + return _mm512_reduce_max_ps(__W); } int test_mm512_reduce_min_epi32(__m512i __W){ @@ -159,19 +125,9 @@ } float test_mm512_reduce_min_ps(__m512 __W){ - // CHECK-LABEL: define{{.*}} float @test_mm512_reduce_min_ps( - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}) - // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> - // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> - // CHECK: call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> - // CHECK: call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> - // CHECK: call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: extractelement <4 x float> %{{.*}}, i32 0 - return _mm512_reduce_min_ps(__W); +// CHECK-LABEL: @test_mm512_reduce_min_ps( +// CHECK: call nnan nsz float @llvm.vector.reduce.fmin.v16f32(<16 x float> %{{.*}}) + return _mm512_reduce_min_ps(__W); } int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){ @@ -179,7 +135,7 @@ // CHECK: bitcast i16 %{{.*}} to <16 x i1> // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} // CHECK: call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %{{.*}}) - return _mm512_mask_reduce_max_epi32(__M, __W); + return _mm512_mask_reduce_max_epi32(__M, __W); } unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){ @@ -187,25 +143,15 @@ // 
CHECK: bitcast i16 %{{.*}} to <16 x i1> // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} // CHECK: call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %{{.*}}) - return _mm512_mask_reduce_max_epu32(__M, __W); + return _mm512_mask_reduce_max_epu32(__M, __W); } float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){ - // CHECK-LABEL: define{{.*}} float @test_mm512_mask_reduce_max_ps( - // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}) - // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> - // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> - // CHECK: call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> - // CHECK: call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> - // CHECK: call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: extractelement <4 x float> %{{.*}}, i32 0 - return _mm512_mask_reduce_max_ps(__M, __W); +// CHECK-LABEL: @test_mm512_mask_reduce_max_ps( +// CHECK: bitcast i16 %{{.*}} to <16 x i1> +// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} +// CHECK: call nnan nsz float @llvm.vector.reduce.fmax.v16f32(<16 x float> %{{.*}}) + return _mm512_mask_reduce_max_ps(__M, __W); } int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){ @@ -213,7 +159,7 @@ // CHECK: bitcast i16 %{{.*}} to <16 x i1> // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} // 
CHECK: call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %{{.*}}) - return _mm512_mask_reduce_min_epi32(__M, __W); + return _mm512_mask_reduce_min_epi32(__M, __W); } unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){ @@ -221,24 +167,14 @@ // CHECK: bitcast i16 %{{.*}} to <16 x i1> // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} // CHECK: call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %{{.*}}) - return _mm512_mask_reduce_min_epu32(__M, __W); + return _mm512_mask_reduce_min_epu32(__M, __W); } float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){ - // CHECK-LABEL: define{{.*}} float @test_mm512_mask_reduce_min_ps( - // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> - // CHECK: call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}) - // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> - // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> - // CHECK: call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> - // CHECK: call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> - // CHECK: call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: extractelement <4 x float> %{{.*}}, i32 0 - return _mm512_mask_reduce_min_ps(__M, __W); +// CHECK-LABEL: @test_mm512_mask_reduce_min_ps( +// CHECK: bitcast i16 %{{.*}} to <16 x i1> +// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} +// CHECK: call nnan nsz float @llvm.vector.reduce.fmin.v16f32(<16 x 
float> %{{.*}}) + return _mm512_mask_reduce_min_ps(__M, __W); }