Index: lib/Headers/avx512fintrin.h =================================================================== --- lib/Headers/avx512fintrin.h +++ lib/Headers/avx512fintrin.h @@ -9281,251 +9281,185 @@ * A bisection method is used: at each step, the vector from the previous * step is partitioned in half, and the operation is performed on its two halves. * This takes log2(n) steps where n is the number of elements in the vector. - - * Vec512 - Vector with size 512. - * Operator - Can be one of following: +,*,&,| - * T2 - Can get 'i' for int and 'f' for float. - * T1 - Can get 'i' for int and 'd' for double. */ -#define _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1) \ - __extension__({ \ - __m256##T1 Vec256 = __builtin_shufflevector( \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512, \ - 0, 1, 2, 3) \ - Operator \ - __builtin_shufflevector( \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512, \ - 4, 5, 6, 7); \ - __m128##T1 Vec128 = __builtin_shufflevector( \ - (__v4d##T2)Vec256, \ - (__v4d##T2)Vec256, \ - 0, 1) \ - Operator \ - __builtin_shufflevector( \ - (__v4d##T2)Vec256, \ - (__v4d##T2)Vec256, \ - 2, 3); \ - Vec128 = __builtin_shufflevector((__v2d##T2)Vec128, \ - (__v2d##T2)Vec128, 0, -1) \ - Operator \ - __builtin_shufflevector((__v2d##T2)Vec128, \ - (__v2d##T2)Vec128, 1, -1); \ - return Vec128[0]; \ - }) +#define _mm512_mask_reduce_operator(op) \ + __v4du __t1 = (__v4du)_mm512_extracti64x4_epi64(__W, 0); \ + __v4du __t2 = (__v4du)_mm512_extracti64x4_epi64(__W, 1); \ + __m256i __t3 = (__m256i)(__t1 op __t2); \ + __v2du __t4 = (__v2du)_mm256_extracti128_si256(__t3, 0); \ + __v2du __t5 = (__v2du)_mm256_extracti128_si256(__t3, 1); \ + __v2du __t6 = __t4 op __t5; \ + __v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ + __v2du __t8 = __t6 op __t7; \ + return __t8[0]; static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) { - _mm512_reduce_operator_64bit(__W, +, i, i); + _mm512_mask_reduce_operator(+); } static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) { - _mm512_reduce_operator_64bit(__W, *, i, i); + _mm512_mask_reduce_operator(*); } static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) { - _mm512_reduce_operator_64bit(__W, &, i, i); + _mm512_mask_reduce_operator(&); } static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) { - _mm512_reduce_operator_64bit(__W, |, i, i); -} - -static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) { - _mm512_reduce_operator_64bit(__W, +, f, d); + _mm512_mask_reduce_operator(|); } -static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) { - _mm512_reduce_operator_64bit(__W, *, f, d); -} - -/* Vec512 - Vector with size 512. - * Vec512Neutral - All vector elements set to the identity element. - * Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0} - * Operator - Can be one of following: +,*,&,| - * Mask - Intrinsic Mask - * T2 - Can get 'i' for int and 'f' for float. - * T1 - Can get 'i' for int and 'd' for packed double-precision. - * T3 - Can be Pd for packed double or q for q-word.
- */ - -#define _mm512_mask_reduce_operator_64bit(Vec512, Vec512Neutral, Operator, \ - Mask, T2, T1, T3) \ - __extension__({ \ - Vec512 = __builtin_ia32_select##T3##_512( \ - (__mmask8)Mask, \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512Neutral); \ - _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1); \ - }) - static __inline__ long long __DEFAULT_FN_ATTRS _mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), +, __M, i, i, q); + __W = _mm512_maskz_mov_epi64(__M, __W); + _mm512_mask_reduce_operator(+); } static __inline__ long long __DEFAULT_FN_ATTRS _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(1), *, __M, i, i, q); + __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W); + _mm512_mask_reduce_operator(*); } static __inline__ long long __DEFAULT_FN_ATTRS _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), - &, __M, i, i, q); + __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W); + _mm512_mask_reduce_operator(&); } static __inline__ long long __DEFAULT_FN_ATTRS _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M, - i, i, q); + __W = _mm512_maskz_mov_epi64(__M, __W); + _mm512_mask_reduce_operator(|); +} +#undef _mm512_mask_reduce_operator + +#define _mm512_mask_reduce_operator(op) \ + __m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \ + __m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \ + __m256d __t3 = __t1 op __t2; \ + __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \ + __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \ + __m128d __t6 = __t4 op __t5; \ + __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ + __m128d __t8 = __t6 op __t7; \ + return __t8[0]; + +static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) { + _mm512_mask_reduce_operator(+); +} + +static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) { + _mm512_mask_reduce_operator(*); } static __inline__ double __DEFAULT_FN_ATTRS _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M, - f, d, pd); + __W = _mm512_maskz_mov_pd(__M, __W); + _mm512_mask_reduce_operator(+); } static __inline__ double __DEFAULT_FN_ATTRS _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(1), *, __M, - f, d, pd); + __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W); + _mm512_mask_reduce_operator(*); } -#undef _mm512_reduce_operator_64bit -#undef _mm512_mask_reduce_operator_64bit - -/* Vec512 - Vector with size 512. - * Operator - Can be one of following: +,*,&,| - * T2 - Can get 'i' for int and ' ' for packed single. - * T1 - Can get 'i' for int and 'f' for float. 
- */ +#undef _mm512_mask_reduce_operator -#define _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1) __extension__({ \ - __m256##T1 Vec256 = \ - (__m256##T1)(__builtin_shufflevector( \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512, \ - 0, 1, 2, 3, 4, 5, 6, 7) \ - Operator \ - __builtin_shufflevector( \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512, \ - 8, 9, 10, 11, 12, 13, 14, 15)); \ - __m128##T1 Vec128 = \ - (__m128##T1)(__builtin_shufflevector( \ - (__v8s##T2)Vec256, \ - (__v8s##T2)Vec256, \ - 0, 1, 2, 3) \ - Operator \ - __builtin_shufflevector( \ - (__v8s##T2)Vec256, \ - (__v8s##T2)Vec256, \ - 4, 5, 6, 7)); \ - Vec128 = (__m128##T1)(__builtin_shufflevector( \ - (__v4s##T2)Vec128, \ - (__v4s##T2)Vec128, \ - 0, 1, -1, -1) \ - Operator \ - __builtin_shufflevector( \ - (__v4s##T2)Vec128, \ - (__v4s##T2)Vec128, \ - 2, 3, -1, -1)); \ - Vec128 = (__m128##T1)(__builtin_shufflevector( \ - (__v4s##T2)Vec128, \ - (__v4s##T2)Vec128, \ - 0, -1, -1, -1) \ - Operator \ - __builtin_shufflevector( \ - (__v4s##T2)Vec128, \ - (__v4s##T2)Vec128, \ - 1, -1, -1, -1)); \ - return Vec128[0]; \ - }) +#define _mm512_mask_reduce_operator(op) \ + __v8su __t1 = (__v8su)_mm512_extracti64x4_epi64(__W, 0); \ + __v8su __t2 = (__v8su)_mm512_extracti64x4_epi64(__W, 1); \ + __m256i __t3 = (__m256i)(__t1 op __t2); \ + __v4su __t4 = (__v4su)_mm256_extracti128_si256(__t3, 0); \ + __v4su __t5 = (__v4su)_mm256_extracti128_si256(__t3, 1); \ + __v4su __t6 = __t4 op __t5; \ + __v4su __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ + __v4su __t8 = __t6 op __t7; \ + __v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ + __v4su __t10 = __t8 op __t9; \ + return __t10[0]; static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_add_epi32(__m512i __W) { - _mm512_reduce_operator_32bit(__W, +, i, i); + _mm512_mask_reduce_operator(+); } static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi32(__m512i __W) { - _mm512_reduce_operator_32bit(__W, *, i, i); + _mm512_mask_reduce_operator(*); } static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_and_epi32(__m512i __W) { - _mm512_reduce_operator_32bit(__W, &, i, i); + _mm512_mask_reduce_operator(&); } static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_or_epi32(__m512i __W) { - _mm512_reduce_operator_32bit(__W, |, i, i); -} - -static __inline__ float __DEFAULT_FN_ATTRS -_mm512_reduce_add_ps(__m512 __W) { - _mm512_reduce_operator_32bit(__W, +, f, ); + _mm512_mask_reduce_operator(|); } -static __inline__ float __DEFAULT_FN_ATTRS -_mm512_reduce_mul_ps(__m512 __W) { - _mm512_reduce_operator_32bit(__W, *, f, ); -} - -/* Vec512 - Vector with size 512. - * Vec512Neutral - All vector elements set to the identity element. - * Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0} - * Operator - Can be one of following: +,*,&,| - * Mask - Intrinsic Mask - * T2 - Can get 'i' for int and 'f' for float. - * T1 - Can get 'i' for int and 'd' for double. - * T3 - Can be Ps for packed single or d for d-word. 
- */ - -#define _mm512_mask_reduce_operator_32bit(Vec512, Vec512Neutral, Operator, \ - Mask, T2, T1, T3) \ - __extension__({ \ - Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \ - (__mmask16)Mask, \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512Neutral); \ - _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1); \ - }) - static __inline__ int __DEFAULT_FN_ATTRS _mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), +, __M, i, i, d); + __W = _mm512_maskz_mov_epi32(__M, __W); + _mm512_mask_reduce_operator(+); } static __inline__ int __DEFAULT_FN_ATTRS _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(1), *, __M, i, i, d); + __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W); + _mm512_mask_reduce_operator(*); } static __inline__ int __DEFAULT_FN_ATTRS _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M, - i, i, d); + __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W); + _mm512_mask_reduce_operator(&); } static __inline__ int __DEFAULT_FN_ATTRS _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), |, __M, i, i, d); + __W = _mm512_maskz_mov_epi32(__M, __W); + _mm512_mask_reduce_operator(|); +} +#undef _mm512_mask_reduce_operator + +#define _mm512_mask_reduce_operator(op) \ + __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \ + __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \ + __m256 __t3 = __t1 op __t2; \ + __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \ + __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \ + __m128 __t6 = __t4 op __t5; \ + __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ + __m128 __t8 = __t6 op __t7; \ + __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ + __m128 __t10 = __t8 op __t9; \ + return __t10[0]; + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_reduce_add_ps(__m512 __W) { + _mm512_mask_reduce_operator(+); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_reduce_mul_ps(__m512 __W) { + _mm512_mask_reduce_operator(*); } static __inline__ float __DEFAULT_FN_ATTRS _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps); + __W = _mm512_maskz_mov_ps(__M, __W); + _mm512_mask_reduce_operator(+); } static __inline__ float __DEFAULT_FN_ATTRS _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps); + __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W); + _mm512_mask_reduce_operator(*); } -#undef _mm512_reduce_operator_32bit -#undef _mm512_mask_reduce_operator_32bit +#undef _mm512_mask_reduce_operator #define _mm512_mask_reduce_operator(op) \ __m512i __t1 = (__m512i)__builtin_shufflevector((__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3); \ Index: test/CodeGen/avx512-reduceIntrin.c =================================================================== --- test/CodeGen/avx512-reduceIntrin.c +++ test/CodeGen/avx512-reduceIntrin.c @@ -1,410 +1,404 @@ -// RUN: %clang_cc1 -ffreestanding %s -O2 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s #include <immintrin.h>
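/* Editor's sketch, not part of this patch: the masked forms above work by
 * first overwriting the masked-off lanes with the operation's identity
 * element (0 for + and |, 1 for *, all-ones for &) and then running the
 * same unmasked log2(n) tree reduction. A minimal sanity check of that
 * claim, assuming an AVX-512 host; the helper names below are mine. */
static long long scalar_masked_add(const long long *v, unsigned char m) {
  long long s = 0;                      /* 0 is the identity for + */
  for (int i = 0; i < 8; ++i)
    if (m & (1u << i))                  /* only lanes selected by the mask */
      s += v[i];
  return s;
}

static int check_masked_reduce_add(void) {
  long long v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  __m512i w = _mm512_loadu_si512((const void *)v);
  __mmask8 m = 0xA5;                    /* selects lanes 0, 2, 5, 7 */
  /* _mm512_maskz_mov_epi64 zeroes lanes 1, 3, 4, 6, so the full eight-lane
     tree reduction equals the scalar sum over the selected lanes alone. */
  return _mm512_mask_reduce_add_epi64(m, w) == scalar_masked_add(v, m); /* 18 == 18 */
}

long long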
test_mm512_reduce_add_epi64(__m512i __W){ - // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> - // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> - // CHECK: %add.i = add <4 x i64> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> - // CHECK: %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> - // CHECK: %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> - // CHECK: %add7.i = add <2 x i64> %shuffle6.i, %add4.i - // CHECK: %vecext.i = extractelement <2 x i64> %add7.i, i32 0 - // CHECK: ret i64 %vecext.i +// CHECK-LABEL: @test_mm512_reduce_add_epi64( +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: add <4 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: add <2 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> +// CHECK: add <2 x i64> %{{.*}}, %{{.*}} +// CHECK: extractelement <2 x i64> %{{.*}}, i32 0 return _mm512_reduce_add_epi64(__W); } long long test_mm512_reduce_mul_epi64(__m512i __W){ - // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> - // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> - // CHECK: %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> - // CHECK: %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> - // CHECK: %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> - // CHECK: %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i - // CHECK: %vecext.i = extractelement <2 x i64> %mul7.i, i32 0 - // CHECK: ret i64 %vecext.i +// CHECK-LABEL: @test_mm512_reduce_mul_epi64( +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: mul <4 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: mul <2 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> +// CHECK: mul <2 x i64> %{{.*}}, %{{.*}} +// CHECK: extractelement <2 x i64> %{{.*}}, i32 0 return _mm512_reduce_mul_epi64(__W); } long long test_mm512_reduce_or_epi64(__m512i __W){ - // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> - // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> - // CHECK: %or.i = or <4 x i64> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> - // CHECK: %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> - // CHECK: %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> - // CHECK: %or7.i = or <2 x i64> %shuffle6.i, %or4.i - // CHECK: %vecext.i = extractelement <2 x i64> %or7.i, i32 0 - // CHECK: ret i64 %vecext.i +// CHECK-LABEL: @test_mm512_reduce_or_epi64( +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> 
undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: or <4 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: or <2 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> +// CHECK: or <2 x i64> %{{.*}}, %{{.*}} +// CHECK: extractelement <2 x i64> %{{.*}}, i32 0 return _mm512_reduce_or_epi64(__W); } long long test_mm512_reduce_and_epi64(__m512i __W){ - // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> - // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> - // CHECK: %and.i = and <4 x i64> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> - // CHECK: %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> - // CHECK: %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> - // CHECK: %and7.i = and <2 x i64> %shuffle6.i, %and4.i - // CHECK: %vecext.i = extractelement <2 x i64> %and7.i, i32 0 - // CHECK: ret i64 %vecext.i +// CHECK-LABEL: @test_mm512_reduce_and_epi64( +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: and <4 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: and <2 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> +// CHECK: and <2 x i64> %{{.*}}, %{{.*}} +// CHECK: extractelement <2 x i64> %{{.*}}, i32 0 return _mm512_reduce_and_epi64(__W); } long long test_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W){ - // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1> - // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x i64> %__W, <8 x i64> zeroinitializer - // CHECK: %shuffle.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> - // CHECK: %shuffle1.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> - // CHECK: %add.i = add <4 x i64> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> - // CHECK: %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> - // CHECK: %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> - // CHECK: %add7.i = add <2 x i64> %shuffle6.i, %add4.i - // CHECK: %vecext.i = extractelement <2 x i64> %add7.i, i32 0 - // CHECK: ret i64 %vecext.i +// CHECK-LABEL: @test_mm512_mask_reduce_add_epi64( +// CHECK: bitcast i8 %{{.*}} to <8 x i1> +// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: add <4 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: add <2 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> +// CHECK: add <2 x i64> %{{.*}}, %{{.*}} +// CHECK: extractelement <2 x i64> %{{.*}}, i32 0 return _mm512_mask_reduce_add_epi64(__M, __W); } long long 
test_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W){ - // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1> - // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x i64> %__W, <8 x i64> - // CHECK: %shuffle.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> - // CHECK: %shuffle1.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> - // CHECK: %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> - // CHECK: %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> - // CHECK: %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> - // CHECK: %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i - // CHECK: %vecext.i = extractelement <2 x i64> %mul7.i, i32 0 - // CHECK: ret i64 %vecext.i +// CHECK-LABEL: @test_mm512_mask_reduce_mul_epi64( +// CHECK: bitcast i8 %{{.*}} to <8 x i1> +// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: mul <4 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: mul <2 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> +// CHECK: mul <2 x i64> %{{.*}}, %{{.*}} +// CHECK: extractelement <2 x i64> %{{.*}}, i32 0 return _mm512_mask_reduce_mul_epi64(__M, __W); } long long test_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W){ - // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1> - // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x i64> %__W, <8 x i64> - // CHECK: %shuffle.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> - // CHECK: %shuffle1.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> - // CHECK: %and.i = and <4 x i64> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> - // CHECK: %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> - // CHECK: %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> - // CHECK: %and7.i = and <2 x i64> %shuffle6.i, %and4.i - // CHECK: %vecext.i = extractelement <2 x i64> %and7.i, i32 0 +// CHECK-LABEL: @test_mm512_mask_reduce_and_epi64( +// CHECK: bitcast i8 %{{.*}} to <8 x i1> +// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: and <4 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: and <2 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> +// CHECK: and <2 x i64> %{{.*}}, %{{.*}} +// CHECK: extractelement <2 x i64> %{{.*}}, i32 0 return _mm512_mask_reduce_and_epi64(__M, __W); } long long test_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W){ - // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1> - // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x i64> %__W, <8 x i64> zeroinitializer - // CHECK: %shuffle.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> - // CHECK: %shuffle1.i = 
shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> - // CHECK: %or.i = or <4 x i64> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> - // CHECK: %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> - // CHECK: %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> - // CHECK: %or7.i = or <2 x i64> %shuffle6.i, %or4.i - // CHECK: %vecext.i = extractelement <2 x i64> %or7.i, i32 0 - // CHECK: ret i64 %vecext.i +// CHECK-LABEL: @test_mm512_mask_reduce_or_epi64( +// CHECK: bitcast i8 %{{.*}} to <8 x i1> +// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: or <4 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: or <2 x i64> %{{.*}}, %{{.*}} +// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> +// CHECK: or <2 x i64> %{{.*}}, %{{.*}} +// CHECK: extractelement <2 x i64> %{{.*}}, i32 0 return _mm512_mask_reduce_or_epi64(__M, __W); } int test_mm512_reduce_add_epi32(__m512i __W){ - // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> - // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %add.i = add <8 x i32> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> - // CHECK: %shuffle3.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> - // CHECK: %add4.i = add <4 x i32> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <4 x i32> %add4.i, <4 x i32> undef, <4 x i32> - // CHECK: %add7.i = add <4 x i32> %shuffle6.i, %add4.i - // CHECK: %shuffle9.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> - // CHECK: %add10.i = add <4 x i32> %shuffle9.i, %add7.i - // CHECK: {{.*}} = bitcast <4 x i32> %add10.i to <2 x i64> - // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 - // CHECK: %conv.i = trunc i64 %vecext.i to i32 - // CHECK: ret i32 %conv.i +// CHECK-LABEL: @test_mm512_reduce_add_epi32( +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: add <8 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: add <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: add <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: add <4 x i32> %{{.*}}, %{{.*}} +// CHECK: extractelement <4 x i32> %{{.*}}, i32 0 return _mm512_reduce_add_epi32(__W); } int test_mm512_reduce_mul_epi32(__m512i __W){ - // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> - // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %mul.i = mul <8 x i32> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> - // CHECK: %shuffle3.i 
= shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> - // CHECK: %mul4.i = mul <4 x i32> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <4 x i32> %mul4.i, <4 x i32> undef, <4 x i32> - // CHECK: %mul7.i = mul <4 x i32> %shuffle6.i, %mul4.i - // CHECK: %shuffle9.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> - // CHECK: %mul10.i = mul <4 x i32> %shuffle9.i, %mul7.i - // CHECK: {{.*}} = bitcast <4 x i32> %mul10.i to <2 x i64> - // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 - // CHECK: %conv.i = trunc i64 %vecext.i to i32 - // CHECK: ret i32 %conv.i +// CHECK-LABEL: @test_mm512_reduce_mul_epi32( +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: mul <8 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: mul <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: mul <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: mul <4 x i32> %{{.*}}, %{{.*}} +// CHECK: extractelement <4 x i32> %{{.*}}, i32 0 return _mm512_reduce_mul_epi32(__W); } int test_mm512_reduce_or_epi32(__m512i __W){ - // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> - // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %or.i = or <8 x i32> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> - // CHECK: %shuffle3.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> - // CHECK: %or4.i = or <4 x i32> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <4 x i32> %or4.i, <4 x i32> undef, <4 x i32> - // CHECK: %or7.i = or <4 x i32> %shuffle6.i, %or4.i - // CHECK: %shuffle9.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> - // CHECK: %or10.i = or <4 x i32> %shuffle9.i, %or7.i - // CHECK: {{.*}} = bitcast <4 x i32> %or10.i to <2 x i64> - // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 - // CHECK: %conv.i = trunc i64 %vecext.i to i32 - // CHECK: ret i32 %conv.i +// CHECK-LABEL: @test_mm512_reduce_or_epi32( +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: or <8 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: or <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: or <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: or <4 x i32> %{{.*}}, %{{.*}} +// CHECK: extractelement <4 x i32> %{{.*}}, i32 0 return _mm512_reduce_or_epi32(__W); } int test_mm512_reduce_and_epi32(__m512i __W){ - // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> - // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %and.i = and <8 x i32> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> - // CHECK: 
%shuffle3.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> - // CHECK: %and4.i = and <4 x i32> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <4 x i32> %and4.i, <4 x i32> undef, <4 x i32> - // CHECK: %and7.i = and <4 x i32> %shuffle6.i, %and4.i - // CHECK: %shuffle9.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> - // CHECK: %and10.i = and <4 x i32> %shuffle9.i, %and7.i - // CHECK: {{.*}} = bitcast <4 x i32> %and10.i to <2 x i64> - // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 - // CHECK: %conv.i = trunc i64 %vecext.i to i32 - // CHECK: ret i32 %conv.i +// CHECK-LABEL: @test_mm512_reduce_and_epi32( +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: and <8 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: and <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: and <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: and <4 x i32> %{{.*}}, %{{.*}} +// CHECK: extractelement <4 x i32> %{{.*}}, i32 0 return _mm512_reduce_and_epi32(__W); } int test_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W){ - // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> - // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1> - // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32> zeroinitializer - // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %add.i = add <8 x i32> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> - // CHECK: %shuffle3.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> - // CHECK: %add4.i = add <4 x i32> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <4 x i32> %add4.i, <4 x i32> undef, <4 x i32> - // CHECK: %add7.i = add <4 x i32> %shuffle6.i, %add4.i - // CHECK: %shuffle9.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> - // CHECK: %add10.i = add <4 x i32> %shuffle9.i, %add7.i - // CHECK: {{.*}} = bitcast <4 x i32> %add10.i to <2 x i64> - // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 - // CHECK: %conv.i = trunc i64 %vecext.i to i32 - // CHECK: ret i32 %conv.i +// CHECK-LABEL: @test_mm512_mask_reduce_add_epi32( +// CHECK: bitcast i16 %{{.*}} to <16 x i1> +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} +// CHECK: bitcast <16 x i32> %{{.*}} to <8 x i64> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: add <8 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: add <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: add <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: add <4 x i32> %{{.*}}, %{{.*}} +// CHECK: extractelement <4 x i32> %{{.*}}, i32 0 return _mm512_mask_reduce_add_epi32(__M, __W); } int test_mm512_mask_reduce_mul_epi32(__mmask16 __M, 
__m512i __W){ - // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> - // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1> - // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32> - // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %mul.i = mul <8 x i32> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> - // CHECK: %shuffle3.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> - // CHECK: %mul4.i = mul <4 x i32> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <4 x i32> %mul4.i, <4 x i32> undef, <4 x i32> - // CHECK: %mul7.i = mul <4 x i32> %shuffle6.i, %mul4.i - // CHECK: %shuffle9.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> - // CHECK: %mul10.i = mul <4 x i32> %shuffle9.i, %mul7.i - // CHECK: {{.*}} = bitcast <4 x i32> %mul10.i to <2 x i64> - // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 - // CHECK: %conv.i = trunc i64 %vecext.i to i32 - // CHECK: ret i32 %conv.i +// CHECK-LABEL: @test_mm512_mask_reduce_mul_epi32( +// CHECK: bitcast i16 %{{.*}} to <16 x i1> +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} +// CHECK: bitcast <16 x i32> %{{.*}} to <8 x i64> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: mul <8 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: mul <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: mul <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: mul <4 x i32> %{{.*}}, %{{.*}} +// CHECK: extractelement <4 x i32> %{{.*}}, i32 0 return _mm512_mask_reduce_mul_epi32(__M, __W); } int test_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W){ - // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> - // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1> - // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32> - // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %and.i = and <8 x i32> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> - // CHECK: %shuffle3.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> - // CHECK: %and4.i = and <4 x i32> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <4 x i32> %and4.i, <4 x i32> undef, <4 x i32> - // CHECK: %and7.i = and <4 x i32> %shuffle6.i, %and4.i - // CHECK: %shuffle9.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> - // CHECK: %and10.i = and <4 x i32> %shuffle9.i, %and7.i - // CHECK: {{.*}} = bitcast <4 x i32> %and10.i to <2 x i64> - // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 - // CHECK: %conv.i = trunc i64 %vecext.i to i32 - // CHECK: ret i32 %conv.i +// CHECK-LABEL: @test_mm512_mask_reduce_and_epi32( +// CHECK: bitcast i16 %{{.*}} to <16 x i1> +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} +// CHECK: bitcast <16 x i32> %{{.*}} to <8 x i64> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> 
undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: and <8 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: and <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: and <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: and <4 x i32> %{{.*}}, %{{.*}} +// CHECK: extractelement <4 x i32> %{{.*}}, i32 0 return _mm512_mask_reduce_and_epi32(__M, __W); } int test_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W){ - // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32> - // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1> - // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32> zeroinitializer - // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> - // CHECK: %or.i = or <8 x i32> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> - // CHECK: %shuffle3.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> - // CHECK: %or4.i = or <4 x i32> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <4 x i32> %or4.i, <4 x i32> undef, <4 x i32> - // CHECK: %or7.i = or <4 x i32> %shuffle6.i, %or4.i - // CHECK: %shuffle9.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> - // CHECK: %or10.i = or <4 x i32> %shuffle9.i, %or7.i - // CHECK: {{.*}} = bitcast <4 x i32> %or10.i to <2 x i64> - // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0 - // CHECK: %conv.i = trunc i64 %vecext.i to i32 - // CHECK: ret i32 %conv.i +// CHECK-LABEL: @test_mm512_mask_reduce_or_epi32( +// CHECK: bitcast i16 %{{.*}} to <16 x i1> +// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} +// CHECK: bitcast <16 x i32> %{{.*}} to <8 x i64> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> +// CHECK: or <8 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> +// CHECK: or <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: or <4 x i32> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> +// CHECK: or <4 x i32> %{{.*}}, %{{.*}} +// CHECK: extractelement <4 x i32> %{{.*}}, i32 0 return _mm512_mask_reduce_or_epi32(__M, __W); } double test_mm512_reduce_add_pd(__m512d __W){ - // CHECK: %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> - // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> - // CHECK: %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> - // CHECK: %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> - // CHECK: %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> - // CHECK: %add7.i = fadd <2 x double> %add4.i, %shuffle6.i - // CHECK: %vecext.i = extractelement <2 x double> %add7.i, i32 0 - // CHECK: ret double %vecext.i +// 
CHECK-LABEL: @test_mm512_reduce_add_pd( +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: fadd <4 x double> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> +// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> +// CHECK: fadd <2 x double> %{{.*}}, %{{.*}} +// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> +// CHECK: fadd <2 x double> %{{.*}}, %{{.*}} +// CHECK: extractelement <2 x double> %{{.*}}, i32 0 return _mm512_reduce_add_pd(__W); } double test_mm512_reduce_mul_pd(__m512d __W){ - // CHECK: %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> - // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> - // CHECK: %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> - // CHECK: %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> - // CHECK: %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> - // CHECK: %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i - // CHECK: %vecext.i = extractelement <2 x double> %mul7.i, i32 0 - // CHECK: ret double %vecext.i +// CHECK-LABEL: @test_mm512_reduce_mul_pd( +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: fmul <4 x double> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> +// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> +// CHECK: fmul <2 x double> %{{.*}}, %{{.*}} +// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> +// CHECK: fmul <2 x double> %{{.*}}, %{{.*}} +// CHECK: extractelement <2 x double> %{{.*}}, i32 0 return _mm512_reduce_mul_pd(__W); } float test_mm512_reduce_add_ps(__m512 __W){ - // CHECK: %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> - // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> - // CHECK: %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> - // CHECK: %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> - // CHECK: %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> - // CHECK: %add7.i = fadd <4 x float> %add4.i, %shuffle6.i - // CHECK: %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> - // CHECK: %add10.i = fadd <4 x float> %add7.i, %shuffle9.i - // CHECK: %vecext.i = extractelement <4 x float> %add10.i, i32 0 - // CHECK: ret float %vecext.i +// CHECK-LABEL: @test_mm512_reduce_add_ps( +// CHECK: bitcast <16 x float> %{{.*}} to <8 x double> +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> +// CHECK: fadd <8 x float> %{{.*}}, %{{.*}} +// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> +// CHECK: shufflevector <8 x float> %{{.*}}, 
<8 x float> undef, <4 x i32> +// CHECK: fadd <4 x float> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> +// CHECK: fadd <4 x float> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> +// CHECK: fadd <4 x float> %{{.*}}, %{{.*}} +// CHECK: extractelement <4 x float> %{{.*}}, i32 0 return _mm512_reduce_add_ps(__W); } float test_mm512_reduce_mul_ps(__m512 __W){ - // CHECK: %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> - // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> - // CHECK: %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> - // CHECK: %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> - // CHECK: %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32> - // CHECK: %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i - // CHECK: %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> - // CHECK: %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i - // CHECK: %vecext.i = extractelement <4 x float> %mul10.i, i32 0 - // CHECK: ret float %vecext.i +// CHECK-LABEL: @test_mm512_reduce_mul_ps( +// CHECK: bitcast <16 x float> %{{.*}} to <8 x double> +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> +// CHECK: fmul <8 x float> %{{.*}}, %{{.*}} +// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> +// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> +// CHECK: fmul <4 x float> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> +// CHECK: fmul <4 x float> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> +// CHECK: fmul <4 x float> %{{.*}}, %{{.*}} +// CHECK: extractelement <4 x float> %{{.*}}, i32 0 return _mm512_reduce_mul_ps(__W); } double test_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W){ - // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1> - // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x double> %__W, <8 x double> zeroinitializer - // CHECK: %shuffle.i = shufflevector <8 x double> {{.*}}, <8 x double> undef, <4 x i32> - // CHECK: %shuffle1.i = shufflevector <8 x double> {{.*}}, <8 x double> undef, <4 x i32> - // CHECK: %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> - // CHECK: %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> - // CHECK: %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> - // CHECK: %add7.i = fadd <2 x double> %add4.i, %shuffle6.i - // CHECK: %vecext.i = extractelement <2 x double> %add7.i, i32 0 - // CHECK: ret double %vecext.i +// CHECK-LABEL: @test_mm512_mask_reduce_add_pd( +// CHECK: bitcast i8 %{{.*}} to <8 x i1> +// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: 
fadd <4 x double> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> +// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> +// CHECK: fadd <2 x double> %{{.*}}, %{{.*}} +// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> +// CHECK: fadd <2 x double> %{{.*}}, %{{.*}} +// CHECK: extractelement <2 x double> %{{.*}}, i32 0 return _mm512_mask_reduce_add_pd(__M, __W); } double test_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W){ - // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1> - // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x double> %__W, <8 x double> - // CHECK: %shuffle.i = shufflevector <8 x double> {{.*}}, <8 x double> undef, <4 x i32> - // CHECK: %shuffle1.i = shufflevector <8 x double> {{.*}}, <8 x double> undef, <4 x i32> - // CHECK: %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> - // CHECK: %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> - // CHECK: %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> - // CHECK: %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i - // CHECK: %vecext.i = extractelement <2 x double> %mul7.i, i32 0 - // CHECK: ret double %vecext.i +// CHECK-LABEL: @test_mm512_mask_reduce_mul_pd( +// CHECK: bitcast i8 %{{.*}} to <8 x i1> +// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: fmul <4 x double> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> +// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> +// CHECK: fmul <2 x double> %{{.*}}, %{{.*}} +// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> +// CHECK: fmul <2 x double> %{{.*}}, %{{.*}} +// CHECK: extractelement <2 x double> %{{.*}}, i32 0 return _mm512_mask_reduce_mul_pd(__M, __W); } float test_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W){ - // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1> - // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x float> %__W, <16 x float> zeroinitializer - // CHECK: %shuffle.i = shufflevector <16 x float> {{.*}}, <16 x float> undef, <8 x i32> - // CHECK: %shuffle1.i = shufflevector <16 x float> {{.*}}, <16 x float> undef, <8 x i32> - // CHECK: %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> - // CHECK: %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> - // CHECK: %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> - // CHECK: %add7.i = fadd <4 x float> %add4.i, %shuffle6.i - // CHECK: %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> - // CHECK: %add10.i = fadd <4 x float> %add7.i, %shuffle9.i - // CHECK: %vecext.i = extractelement <4 x float> %add10.i, i32 0 - // CHECK: ret float %vecext.i +// CHECK-LABEL: @test_mm512_mask_reduce_add_ps( +// CHECK-NEXT: entry: +// CHECK: bitcast i16 %{{.*}} to <16 x i1> +// CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}} +// CHECK: bitcast <16 x float> %{{.*}} to <8 x double> +// CHECK: 
shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> +// CHECK: fadd <8 x float> %{{.*}}, %{{.*}} +// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> +// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> +// CHECK: fadd <4 x float> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> +// CHECK: fadd <4 x float> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> +// CHECK: fadd <4 x float> %{{.*}}, %{{.*}} +// CHECK: extractelement <4 x float> %{{.*}}, i32 0 return _mm512_mask_reduce_add_ps(__M, __W); } float test_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W){ - // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1> - // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x float> %__W, <16 x float> - // CHECK: %shuffle.i = shufflevector <16 x float> {{.*}}, <16 x float> undef, <8 x i32> - // CHECK: %shuffle1.i = shufflevector <16 x float> {{.*}}, <16 x float> undef, <8 x i32> - // CHECK: %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i - // CHECK: %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> - // CHECK: %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> - // CHECK: %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i - // CHECK: %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32> - // CHECK: %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i - // CHECK: %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> - // CHECK: %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i - // CHECK: %vecext.i = extractelement <4 x float> %mul10.i, i32 0 - // CHECK: ret float %vecext.i +// CHECK-LABEL: @test_mm512_mask_reduce_mul_ps( +// CHECK: bitcast i16 %{{.*}} to <16 x i1> +// CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> %{{.*}} +// CHECK: bitcast <16 x float> %{{.*}} to <8 x double> +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> +// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> +// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> +// CHECK: fmul <8 x float> %{{.*}}, %{{.*}} +// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> +// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> +// CHECK: fmul <4 x float> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> +// CHECK: fmul <4 x float> %{{.*}}, %{{.*}} +// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> +// CHECK: fmul <4 x float> %{{.*}}, %{{.*}} +// CHECK: extractelement <4 x float> %{{.*}}, i32 0 return _mm512_mask_reduce_mul_ps(__M, __W); }
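Editor's note, added as a review aid rather than as part of the change: the standalone sketch below spells out the bisection pattern the new _mm512_mask_reduce_operator macros expand to, using _mm512_reduce_add_epi64 as the example. The helper name reduce_add_epi64_by_hand is illustrative only, and the program assumes an AVX512F host (build with something like cc -O2 -mavx512f).

#include <immintrin.h>
#include <stdio.h>

/* Hand-expanded version of what _mm512_reduce_add_epi64 now compiles to:
   halve 512 -> 256 -> 128 bits, then swap the last two lanes. */
static long long reduce_add_epi64_by_hand(__m512i w) {
  __m256i lo256 = _mm512_extracti64x4_epi64(w, 0);      /* lanes 0..3 */
  __m256i hi256 = _mm512_extracti64x4_epi64(w, 1);      /* lanes 4..7 */
  __m256i sum256 = _mm256_add_epi64(lo256, hi256);      /* step 1: 8 -> 4 */
  __m128i lo128 = _mm256_extracti128_si256(sum256, 0);
  __m128i hi128 = _mm256_extracti128_si256(sum256, 1);
  __m128i sum128 = _mm_add_epi64(lo128, hi128);         /* step 2: 4 -> 2 */
  __m128i swapped = _mm_shuffle_epi32(sum128, 0x4E);    /* swap the two 64-bit halves */
  __m128i total = _mm_add_epi64(sum128, swapped);       /* step 3: 2 -> 1 */
  return _mm_cvtsi128_si64(total);
}

int main(void) {
  __m512i w = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1); /* lanes hold 1..8 */
  printf("%lld %lld\n", reduce_add_epi64_by_hand(w),
         _mm512_reduce_add_epi64(w));                   /* prints 36 36 */
  return 0;
}

Eight lanes collapse in three combining steps, matching the log2(n) claim in the header comment. One detail worth calling out in review: the _ps reductions extract their 256-bit halves as packed double and bitcast back to packed float because _mm512_extractf32x8_ps requires AVX512DQ, which avx512fintrin.h cannot assume; that is why the _ps CHECK lines above contain the extra <8 x double> bitcasts.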