This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min) intrinsics to Clang .
ClosedPublic

Authored by m_zuckerman on Oct 26 2016, 7:31 AM.

Download Raw Diff

Details

Reviewers

AsafBadouh
delena
craig.topper
igorb
aymanmus

Commits

rG25eb42023355: [X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min)…
rC285493: [X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min)…
rL285493: [X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min)…

Summary

Vector-reduction arithmetic accepts vectors as inputs and produces scalars as outputs.
This class of vector operation forms the basis of many scientific computations.
In vector-reduction arithmetic, the evaluation of f is independent of the order of the input elements of V.

Diff Detail

Repository: rL LLVM

Event Timeline

m_zuckerman updated this revision to Diff 75883.Oct 26 2016, 7:31 AM

m_zuckerman retitled this revision from to [X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min) intrinsics to Clang . .

m_zuckerman updated this object.

m_zuckerman added reviewers: igorb, delena, AsafBadouh, aymanmus, craig.topper.

craig.topper added inline comments.Oct 26 2016, 9:28 PM

lib/Headers/avx512fintrin.h
344 ↗	(On Diff #75883)	Do we really need these new set1 macros? The epi ones should be fine shouldn't they?
9955 ↗	(On Diff #75883)	long long
9960 ↗	(On Diff #75883)	long long
9970 ↗	(On Diff #75883)	long long
9975 ↗	(On Diff #75883)	long long
9992 ↗	(On Diff #75883)	intialize is misspelled, but even then I don't think this sentence reads right.
9998 ↗	(On Diff #75883)	Use uppercase 0xFFF.... to match the constants below.
10009 ↗	(On Diff #75883)	Can we just call the set1 macro outside and pass the result in for Neutral instead of needing T4.
10028 ↗	(On Diff #75883)	Use uppercase for consistency.
10031 ↗	(On Diff #75883)	long long
10037 ↗	(On Diff #75883)	long long
10101 ↗	(On Diff #75883)	Do these 512-bit shuffles get narrowed to 256-bit and 128-bit ops for the later stages due to the high bit undefs or do we end up doing 512-bit operations all the way through?

m_zuckerman updated this revision to Diff 76032.Oct 27 2016, 8:01 AM

m_zuckerman marked 8 inline comments as done.

m_zuckerman added inline comments.Oct 27 2016, 12:51 PM

lib/Headers/avx512fintrin.h
10101 ↗	(On Diff #75883)	It will stay all the way 512. This intrinsics only defined on avx512F, and because of that, we can only use the 512bit intrinsics version of the max and min intrinsics.

LGTM with that 1 comment.

lib/Headers/avx512fintrin.h
10046 ↗	(On Diff #76032)	extra space after the 4th -1

This revision is now accepted and ready to land.Oct 28 2016, 9:39 PM

Closed by commit rL285493: [X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min)… (authored by mzuckerm). · Explain WhyOct 29 2016, 3:38 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

cfe/

trunk/

lib/

Headers/

avx512fintrin.h

280 lines

test/

CodeGen/

avx512-reduceMinMaxIntrin.c

437 lines

Diff 76294

cfe/trunk/lib/Headers/avx512fintrin.h

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 9,898 Lines • ▼ Show 20 Lines	_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps);		_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps);
}		}

static __inline__ float __DEFAULT_FN_ATTRS		static __inline__ float __DEFAULT_FN_ATTRS
_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {		_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps);		_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps);
}		}

		// Reduces the eight 64-bit elements of Vec512 to a single value by bisection:
		// at each step the vector produced by the previous step is split in half and
		// _mm512_<IntrinName> is applied to the two halves.
		// This takes log2(n) steps where n is the number of elements in the vector.
		// This macro uses only intrinsics from the AVX512F feature.
		//
		// The expansion assigns to Vec512 and finishes with `return Vec512[0];`, so
		// Vec512 must be a modifiable lvalue and the macro has to be the last
		// statement of the function that uses it.
		// A shuffle index of -1 is a don't-care lane.

		// Vec512 - Vector with size of 512.
		// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example:
		// _mm512_max_epi64
		// T1 - Can get 'i' for int and 'd' for double. [__m512{i|d}]
		// T2 - Can get 'i' for int and 'f' for double. [__v8d{i|f}]

		#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512, \
		0, 1, 2, 3, -1, -1, -1, -1), \
		(__m512##T1)__builtin_shufflevector( \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512, \
		4, 5, 6, 7, -1, -1, -1, -1)); \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512, \
		0, 1, -1, -1, -1, -1, -1, -1),\
		(__m512##T1)__builtin_shufflevector( \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512, \
		2, 3, -1, -1, -1, -1, -1, \
		-1)); \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512, \
		0, -1, -1, -1, -1, -1, -1, -1),\
		(__m512##T1)__builtin_shufflevector( \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512, \
		1, -1, -1, -1, -1, -1, -1, -1))\
		; \
		return Vec512[0]; \
		})

		// Returns the largest of the eight signed 64-bit elements of __V.
		// The reduction macro expands to the function's return statement.
		static __inline__ long long __DEFAULT_FN_ATTRS
		_mm512_reduce_max_epi64(__m512i __V) {
		_mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
		}

		// Returns the largest of the eight unsigned 64-bit elements of __V.
		// The reduction macro expands to the function's return statement.
		static __inline__ unsigned long long __DEFAULT_FN_ATTRS
		_mm512_reduce_max_epu64(__m512i __V) {
		_mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
		}

		// Returns the result of combining the eight double-precision elements of __V
		// with successive _mm512_max_pd operations.
		// The reduction macro expands to the function's return statement.
		static __inline__ double __DEFAULT_FN_ATTRS
		_mm512_reduce_max_pd(__m512d __V) {
		_mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
		}

		// Returns the smallest of the eight signed 64-bit elements of __V.
		// The reduction macro expands to the function's return statement.
		static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64
		(__m512i __V) {
		_mm512_reduce_maxMin_64bit(__V, min_epi64, i, i);
		}

		// Returns the smallest of the eight unsigned 64-bit elements of __V.
		// The reduction macro expands to the function's return statement.
		static __inline__ unsigned long long __DEFAULT_FN_ATTRS
		_mm512_reduce_min_epu64(__m512i __V) {
		_mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
		}

		// Returns the result of combining the eight double-precision elements of __V
		// with successive _mm512_min_pd operations.
		// The reduction macro expands to the function's return statement.
		static __inline__ double __DEFAULT_FN_ATTRS
		_mm512_reduce_min_pd(__m512d __V) {
		_mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
		}

		// Masked 64-bit max/min reduction: every element of Vec512 whose mask bit is
		// clear is first replaced by the matching element of Vec512Neutral, then the
		// result is reduced with _mm512_reduce_maxMin_64bit (which assigns to Vec512
		// and contains the return statement).
		//
		// Vec512 - Vector with size 512.
		// Vec512Neutral - A 512 length vector with elements set to the identity element
		// Identity element (shown as the element's bit pattern):
		// {max_epi,0x8000000000000000}
		// {max_epu,0x0000000000000000}
		// {max_pd, 0xFFF0000000000000}
		// {min_epi,0x7FFFFFFFFFFFFFFF}
		// {min_epu,0xFFFFFFFFFFFFFFFF}
		// {min_pd, 0x7FF0000000000000}
		// For the pd variants these bit patterns are -infinity and +infinity; the
		// neutral vector has to contain those floating-point values.
		//
		// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example:
		// _mm512_max_epi64
		// T1 - Can get 'i' for int and 'd' for double. [__m512{i|d}]
		// T2 - Can get 'i' for int and 'f' for double. [__v8d{i|f}]
		// T3 - Can get 'q' q word and 'pd' for packed double.
		// [__builtin_ia32_select{q|pd}_512]
		// Mask - Intrinsic Mask

		#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \
		T2, T3, Mask) \
		__extension__({ \
		Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
		(__mmask8)Mask, \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512Neutral); \
		_mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2); \
		})

		// Returns the largest signed 64-bit element of __V among those whose bit in
		// __M is set. Unselected elements are replaced by 0x8000000000000000, the
		// identity element for signed max.
		static __inline__ long long __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000),
		max_epi64, i, i, q, __M);
		}

		// Returns the largest unsigned 64-bit element of __V among those whose bit in
		// __M is set. Unselected elements are replaced by 0, the identity element for
		// unsigned max.
		static __inline__ unsigned long long __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000),
		max_epu64, i, i, q, __M);
		}

		// Returns the result of reducing with _mm512_max_pd the double-precision
		// elements of __V whose bit in __M is set. Unselected elements are replaced
		// by negative infinity, the identity element for max.
		// The neutral value is written as a floating-point infinity on purpose: an
		// integer literal such as 0xFFF0000000000000 passed to _mm512_set1_pd is
		// converted numerically (to roughly 1.8e19) instead of being reinterpreted
		// as the IEEE-754 bit pattern of -infinity.
		static __inline__ double __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
		_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(-__builtin_inf()),
		max_pd, d, f, pd, __M);
		}

		// Returns the smallest signed 64-bit element of __V among those whose bit in
		// __M is set. Unselected elements are replaced by 0x7FFFFFFFFFFFFFFF, the
		// identity element for signed min.
		static __inline__ long long __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
		min_epi64, i, i, q, __M);
		}

		// Returns the smallest unsigned 64-bit element of __V among those whose bit
		// in __M is set. Unselected elements are replaced by 0xFFFFFFFFFFFFFFFF, the
		// identity element for unsigned min.
		static __inline__ unsigned long long __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
		min_epu64, i, i, q, __M);
		}

		// Returns the result of reducing with _mm512_min_pd the double-precision
		// elements of __V whose bit in __M is set. Unselected elements are replaced
		// by positive infinity, the identity element for min.
		// The neutral value is written as a floating-point infinity on purpose: an
		// integer literal such as 0x7FF0000000000000 passed to _mm512_set1_pd is
		// converted numerically (to roughly 9.2e18) instead of being reinterpreted
		// as the IEEE-754 bit pattern of +infinity.
		static __inline__ double __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
		_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()),
		min_pd, d, f, pd, __M);
		}

		// Reduces the sixteen 32-bit elements of Vec512 to a single value in four
		// bisection steps: each step applies _mm512_<IntrinName> to the low and high
		// halves of the lanes that are still live.
		// The expansion assigns to Vec512 and finishes with `return Vec512[0];`, so
		// Vec512 must be a modifiable lvalue and the macro has to be the last
		// statement of the function that uses it.
		// A shuffle index of -1 is a don't-care lane.
		//
		// Vec512 - Vector with size 512.
		// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
		// _mm512_max_epi32
		// T1 - Can get 'i' for int and ' ' .[__m512{i|}]
		// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]

		#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		0, 1, 2, 3, 4, 5, 6, 7, \
		-1, -1, -1, -1, -1, -1, -1, -1), \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		8, 9, 10, 11, 12, 13, 14, 15, \
		-1, -1, -1, -1, -1, -1, -1, -1)); \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		0, 1, 2, 3, -1, -1, -1, -1, \
		-1, -1, -1, -1, -1, -1, -1, -1), \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		4, 5, 6, 7, -1, -1, -1, -1, \
		-1, -1, -1, -1, -1, -1, -1, -1)); \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		0, 1, -1, -1, -1, -1, -1, -1, \
		-1, -1, -1, -1, -1, -1, -1, -1), \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		2, 3, -1, -1, -1, -1, -1, -1, \
		-1, -1, -1, -1, -1, -1, -1, -1)); \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		0, -1, -1, -1, -1, -1, -1, -1, \
		-1, -1, -1, -1, -1, -1, -1, -1), \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		1, -1, -1, -1, -1, -1, -1, -1, \
		-1, -1, -1, -1, -1, -1, -1, -1)); \
		return Vec512[0]; \
		})

		// Returns the largest of the sixteen signed 32-bit elements of __V.
		// The reduction macro expands to the function's return statement.
		// The parameter uses a reserved (double-underscore) name so that a user
		// macro cannot clobber it when this header is included.
		static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i __V) {
		_mm512_reduce_maxMin_32bit(__V, max_epi32, i, i);
		}

		// Returns the largest of the sixteen unsigned 32-bit elements of __V.
		// The reduction macro expands to the function's return statement.
		// The parameter uses a reserved (double-underscore) name so that a user
		// macro cannot clobber it when this header is included.
		static __inline__ unsigned int __DEFAULT_FN_ATTRS
		_mm512_reduce_max_epu32(__m512i __V) {
		_mm512_reduce_maxMin_32bit(__V, max_epu32, i, i);
		}

		// Returns the result of combining the sixteen single-precision elements of
		// __V with successive _mm512_max_ps operations.
		// The reduction macro expands to the function's return statement.
		// The parameter uses a reserved (double-underscore) name so that a user
		// macro cannot clobber it when this header is included.
		static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 __V) {
		_mm512_reduce_maxMin_32bit(__V, max_ps, , f);
		}

		// Returns the smallest of the sixteen signed 32-bit elements of __V.
		// The reduction macro expands to the function's return statement.
		// The parameter uses a reserved (double-underscore) name so that a user
		// macro cannot clobber it when this header is included.
		static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i __V) {
		_mm512_reduce_maxMin_32bit(__V, min_epi32, i, i);
		}

		// Returns the smallest of the sixteen unsigned 32-bit elements of __V.
		// The reduction macro expands to the function's return statement.
		// The parameter uses a reserved (double-underscore) name so that a user
		// macro cannot clobber it when this header is included.
		static __inline__ unsigned int __DEFAULT_FN_ATTRS
		_mm512_reduce_min_epu32(__m512i __V) {
		_mm512_reduce_maxMin_32bit(__V, min_epu32, i, i);
		}

		// Returns the result of combining the sixteen single-precision elements of
		// __V with successive _mm512_min_ps operations.
		// The reduction macro expands to the function's return statement.
		// The parameter uses a reserved (double-underscore) name so that a user
		// macro cannot clobber it when this header is included.
		static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 __V) {
		_mm512_reduce_maxMin_32bit(__V, min_ps, , f);
		}

		// Masked 32-bit max/min reduction: every element of Vec512 whose mask bit is
		// clear is first replaced by the matching element of Vec512Neutral, then the
		// result is reduced with _mm512_reduce_maxMin_32bit (which assigns to Vec512
		// and contains the return statement).
		//
		// Vec512 - Vector with size 512.
		// Vec512Neutral - A 512 length vector with elements set to the identity element
		// Identity element (shown as the element's bit pattern):
		// {max_epi,0x80000000}
		// {max_epu,0x00000000}
		// {max_ps, 0xFF800000}
		// {min_epi,0x7FFFFFFF}
		// {min_epu,0xFFFFFFFF}
		// {min_ps, 0x7F800000}
		// For the ps variants these bit patterns are -infinity and +infinity; the
		// neutral vector has to contain those floating-point values.
		//
		// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
		// _mm512_max_epi32
		// T1 - Can get 'i' for int and ' ' .[__m512{i|}]
		// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]
		// T3 - Can get 'd' for double word and 'ps' for packed single.
		// [__builtin_ia32_select{d|ps}_512]
		// Mask - Intrinsic Mask

		#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \
		T2, T3, Mask) \
		__extension__({ \
		Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
		(__mmask16)Mask, \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512Neutral); \
		_mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2); \
		})

		// Returns the largest signed 32-bit element of __V among those whose bit in
		// __M is set. Unselected elements are replaced by 0x80000000, the identity
		// element for signed max.
		static __inline__ int __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32,
		i, i, d, __M);
		}

		// Returns the largest unsigned 32-bit element of __V among those whose bit in
		// __M is set. Unselected elements are replaced by 0, the identity element for
		// unsigned max.
		static __inline__ unsigned int __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32,
		i, i, d, __M);
		}

		// Returns the result of reducing with _mm512_max_ps the single-precision
		// elements of __V whose bit in __M is set. Unselected elements are replaced
		// by negative infinity, the identity element for max.
		// The neutral value is written as a floating-point infinity on purpose: an
		// integer literal such as 0xFF800000 passed to _mm512_set1_ps is converted
		// numerically (to roughly 4.29e9) instead of being reinterpreted as the
		// IEEE-754 bit pattern of -infinity.
		static __inline__ float __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
		_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(-__builtin_inff()), max_ps, , f,
		ps, __M);
		}

		// Returns the smallest signed 32-bit element of __V among those whose bit in
		// __M is set. Unselected elements are replaced by 0x7FFFFFFF, the identity
		// element for signed min.
		static __inline__ int __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32,
		i, i, d, __M);
		}

		// Returns the smallest unsigned 32-bit element of __V among those whose bit
		// in __M is set. Unselected elements are replaced by 0xFFFFFFFF, the identity
		// element for unsigned min.
		static __inline__ unsigned int __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32,
		i, i, d, __M);
		}

		// Returns the result of reducing with _mm512_min_ps the single-precision
		// elements of __V whose bit in __M is set. Unselected elements are replaced
		// by positive infinity, the identity element for min.
		// The neutral value is written as a floating-point infinity on purpose: an
		// integer literal such as 0x7F800000 passed to _mm512_set1_ps is converted
		// numerically (to roughly 2.14e9) instead of being reinterpreted as the
		// IEEE-754 bit pattern of +infinity.
		static __inline__ float __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
		_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()), min_ps, , f,
		ps, __M);
		}

#undef __DEFAULT_FN_ATTRS		#undef __DEFAULT_FN_ATTRS

#endif // __AVX512FINTRIN_H		#endif // __AVX512FINTRIN_H

cfe/trunk/test/CodeGen/avx512-reduceMinMaxIntrin.c

				// RUN: %clang_cc1 -ffreestanding %s -O2 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror |opt -instnamer -S |FileCheck %s

				#include <immintrin.h>

				// Expects _mm512_reduce_max_epi64 to lower to three shuffle stages, each
				// followed by a signed compare and select, ending with element 0.
				long long test_mm512_reduce_max_epi64(__m512i __W){
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = icmp slt <8 x i64> %shuffle1.i, %__W
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle3.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle6.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0
				// CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0
				// CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_reduce_max_epi64(__W);
				}

				// Expects _mm512_reduce_max_epu64 to lower to three shuffle stages, each
				// followed by an unsigned compare and select, ending with element 0.
				unsigned long long test_mm512_reduce_max_epu64(__m512i __W){
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = icmp ult <8 x i64> %shuffle1.i, %__W
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle3.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle6.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0
				// CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0
				// CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_reduce_max_epu64(__W);
				}

				// Expects _mm512_reduce_max_pd to lower to three shuffle stages, each
				// followed by a call to the 512-bit max.pd intrinsic, ending with element 0.
				double test_mm512_reduce_max_pd(__m512d __W){
				// CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %__W, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle3.i = shufflevector <8 x double> %tmp, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp, <8 x double> %shuffle3.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle6.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp1, <8 x double> %shuffle6.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <8 x double> %tmp2, i32 0
				// CHECK: ret double %vecext.i
				return _mm512_reduce_max_pd(__W);
				}

				// NOTE(review): despite its name this test calls _mm512_reduce_max_epi64,
				// and the directives below are identical to the ones in
				// test_mm512_reduce_max_epi64, so _mm512_reduce_min_epi64 is not
				// exercised here. The call should be switched to the min intrinsic
				// and the expected IR regenerated from the compiler output.
				long long test_mm512_reduce_min_epi64(__m512i __W){
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = icmp slt <8 x i64> %shuffle1.i, %__W
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle3.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle6.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0
				// CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0
				// CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_reduce_max_epi64(__W);
				}

				// NOTE(review): despite its name this test calls _mm512_reduce_max_epu64,
				// and the directives below are identical to the ones in
				// test_mm512_reduce_max_epu64, so _mm512_reduce_min_epu64 is not
				// exercised here. The call should be switched to the min intrinsic
				// and the expected IR regenerated from the compiler output.
				unsigned long long test_mm512_reduce_min_epu64(__m512i __W){
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = icmp ult <8 x i64> %shuffle1.i, %__W
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle3.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle6.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0
				// CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0
				// CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_reduce_max_epu64(__W);
				}

				// Expects _mm512_reduce_min_pd to lower to three shuffle stages, each
				// followed by a call to the 512-bit min.pd intrinsic, ending with element 0.
				double test_mm512_reduce_min_pd(__m512d __W){
				// CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %__W, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle3.i = shufflevector <8 x double> %tmp, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp, <8 x double> %shuffle3.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle6.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp1, <8 x double> %shuffle6.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <8 x double> %tmp2, i32 0
				// CHECK: ret double %vecext.i
				return _mm512_reduce_min_pd(__W);
				}

				// Expects the masked signed max to first blend unselected lanes with the
				// most negative i64 value and then run the three-stage signed max reduction.
				long long test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){
				// CHECK: %tmp = bitcast i8 %__M to <8 x i1>
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle1.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle4.i
				// CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp6 = icmp sgt <8 x i64> %tmp5, %shuffle7.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0
				// CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0
				// CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_mask_reduce_max_epi64(__M, __W);
				}

				// Expects the masked unsigned max to first blend unselected lanes with zero
				// and then run the three-stage unsigned max reduction.
				// The return type matches the intrinsic (unsigned long long); on the
				// x86_64 triple used by this test it is still returned as i64.
				unsigned long long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){
				// CHECK: %tmp = bitcast i8 %__M to <8 x i1>
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> zeroinitializer
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle1.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle4.i
				// CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp6 = icmp ugt <8 x i64> %tmp5, %shuffle7.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0
				// CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0
				// CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_mask_reduce_max_epu64(__M, __W);
				}

				// Expects the masked double max to blend unselected lanes with the neutral
				// vector and then run the three-stage max.pd reduction.
				// The test returns double, like the intrinsic, so no float-to-integer
				// conversion appears before the return.
				// NOTE(review): the expected blend constant 0x43EFFE0000000000 is the
				// integer 0xFFF0000000000000 converted numerically to double (about
				// 1.8e19), not negative infinity; it tracks the neutral element that
				// the header passes to _mm512_set1_pd.
				double test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){
				// CHECK: %tmp = bitcast i8 %__M to <8 x i1>
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x double> %__W, <8 x double> <double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000>
				// CHECK: %shuffle1.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp1, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle4.i = shufflevector <8 x double> %tmp2, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp2, <8 x double> %shuffle4.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle7.i = shufflevector <8 x double> %tmp3, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp3, <8 x double> %shuffle7.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <8 x double> %tmp4, i32 0
				// CHECK: ret double %vecext.i
				return _mm512_mask_reduce_max_pd(__M, __W);
				}

				// Expects the masked signed min to first blend unselected lanes with the
				// largest i64 value and then run the three-stage signed min reduction.
				long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){
				// CHECK: %tmp = bitcast i8 %__M to <8 x i1>
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp slt <8 x i64> %tmp1, %shuffle1.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp slt <8 x i64> %tmp3, %shuffle4.i
				// CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp6 = icmp slt <8 x i64> %tmp5, %shuffle7.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0
				// CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0
				// CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_mask_reduce_min_epi64(__M, __W);
				}

				// Expects the masked unsigned min to first blend unselected lanes with the
				// all-ones value and then run the three-stage unsigned min reduction.
				// This test previously called _mm512_mask_reduce_max_epu64 and returned
				// long long, so the min intrinsic was never exercised.
				// NOTE(review): the expected IR below was written by analogy with
				// test_mm512_mask_reduce_min_epi64 (unsigned predicate, all-ones neutral
				// element); confirm it against the actual compiler output.
				unsigned long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){
				// CHECK: %tmp = bitcast i8 %__M to <8 x i1>
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp ult <8 x i64> %tmp1, %shuffle1.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp ult <8 x i64> %tmp3, %shuffle4.i
				// CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp6 = icmp ult <8 x i64> %tmp5, %shuffle7.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0
				// CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0
				// CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_mask_reduce_min_epu64(__M, __W);
				}

				// Expects the masked double min to blend unselected lanes with the neutral
				// vector and then run the three-stage min.pd reduction.
				// NOTE(review): the expected blend constant 0x43DFFC0000000000 is the
				// integer 0x7FF0000000000000 converted numerically to double (about
				// 9.2e18), not positive infinity; it tracks the neutral element that
				// the header passes to _mm512_set1_pd.
				double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){
				// CHECK: %tmp = bitcast i8 %__M to <8 x i1>
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x double> %__W, <8 x double> <double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000>
				// CHECK: %shuffle1.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp1, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle4.i = shufflevector <8 x double> %tmp2, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp2, <8 x double> %shuffle4.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle7.i = shufflevector <8 x double> %tmp3, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp3, <8 x double> %shuffle7.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <8 x double> %tmp4, i32 0
				// CHECK: ret double %vecext.i
				return _mm512_mask_reduce_min_pd(__M, __W);
				}

				// Expects _mm512_reduce_max_epi32 to lower to four shuffle stages on
				// <16 x i32>, each followed by a signed compare and select. The result is
				// then read back as element 0 of the <8 x i64> view and truncated to i32.
				int test_mm512_reduce_max_epi32(__m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = icmp sgt <16 x i32> %tmp, %shuffle1.i
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp sgt <16 x i32> %tmp2, %shuffle3.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp sgt <16 x i32> %tmp4, %shuffle6.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i
				// CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp sgt <16 x i32> %tmp6, %shuffle9.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i
				// CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_reduce_max_epi32(__W);
				}

				// Codegen test for _mm512_reduce_max_epu32: returns the intrinsic's result for __W.
				// Same four-step shuffle/compare/select shape as the signed variant, but the
				// expected comparisons are unsigned-greater-than (icmp ugt). The result is
				// element 0 of the <8 x i64> view truncated to i32.
				unsigned int test_mm512_reduce_max_epu32(__m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = icmp ugt <16 x i32> %tmp, %shuffle1.i
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp ugt <16 x i32> %tmp2, %shuffle3.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp ugt <16 x i32> %tmp4, %shuffle6.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i
				// CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp ugt <16 x i32> %tmp6, %shuffle9.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i
				// CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_reduce_max_epu32(__W);
				}

				// Codegen test for _mm512_reduce_max_ps: returns the intrinsic's result for __W.
				// The expected IR is four shuffle steps (lanes 8-15, 4-7, 2-3, 1), each followed
				// by a call to @llvm.x86.avx512.mask.max.ps.512 with a zeroinitializer
				// pass-through, an all-ones mask (i16 -1) and a trailing i32 4 operand; the
				// result is element 0 of the final vector.
				float test_mm512_reduce_max_ps(__m512 __W){
				// CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %__W, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle3.i = shufflevector <16 x float> %tmp, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp, <16 x float> %shuffle3.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle6.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp1, <16 x float> %shuffle6.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle9.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp2, <16 x float> %shuffle9.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <16 x float> %tmp3, i32 0
				// CHECK: ret float %vecext.i
				return _mm512_reduce_max_ps(__W);
				}

				// Codegen test for _mm512_reduce_min_epi32: returns the intrinsic's result for __W.
				// The expected IR reinterprets the input as <16 x i32>, then performs four
				// shuffle + signed-less-than compare + select steps (lanes 8-15, 4-7, 2-3, 1),
				// and finally extracts element 0 of the <8 x i64> view and truncates it to i32.
				int test_mm512_reduce_min_epi32(__m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = icmp slt <16 x i32> %tmp, %shuffle1.i
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp slt <16 x i32> %tmp2, %shuffle3.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp slt <16 x i32> %tmp4, %shuffle6.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i
				// CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp slt <16 x i32> %tmp6, %shuffle9.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i
				// CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_reduce_min_epi32(__W);
				}

				// Codegen test for _mm512_reduce_min_epu32: returns the intrinsic's result for __W.
				// Same four-step shuffle/compare/select shape as the signed variant, but the
				// expected comparisons are unsigned-less-than (icmp ult). The result is
				// element 0 of the <8 x i64> view truncated to i32.
				unsigned int test_mm512_reduce_min_epu32(__m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = icmp ult <16 x i32> %tmp, %shuffle1.i
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp ult <16 x i32> %tmp2, %shuffle3.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp ult <16 x i32> %tmp4, %shuffle6.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i
				// CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp ult <16 x i32> %tmp6, %shuffle9.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i
				// CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_reduce_min_epu32(__W);
				}

				// Codegen test for _mm512_reduce_min_ps: returns the intrinsic's result for __W.
				// The expected IR is four shuffle steps (lanes 8-15, 4-7, 2-3, 1), each followed
				// by a call to @llvm.x86.avx512.mask.min.ps.512 with a zeroinitializer
				// pass-through, an all-ones mask (i16 -1) and a trailing i32 4 operand; the
				// result is element 0 of the final vector.
				float test_mm512_reduce_min_ps(__m512 __W){
				// CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %__W, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle3.i = shufflevector <16 x float> %tmp, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp, <16 x float> %shuffle3.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle6.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp1, <16 x float> %shuffle6.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle9.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp2, <16 x float> %shuffle9.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <16 x float> %tmp3, i32 0
				// CHECK: ret float %vecext.i
				return _mm512_reduce_min_ps(__W);
				}

				// Codegen test for _mm512_mask_reduce_max_epi32: returns the intrinsic's result
				// for mask __M and vector __W.
				// The expected IR first turns the i16 mask into <16 x i1> and uses it to select
				// between the input lanes and a splat of -2147483648 (the smallest signed
				// 32-bit value, so unselected lanes cannot win a signed max). It then runs the
				// same four shuffle + icmp sgt + select steps as the unmasked reduction and
				// returns element 0 of the <8 x i64> view truncated to i32.
				int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %tmp1 = bitcast i16 %__M to <16 x i1>
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp sgt <16 x i32> %tmp2, %shuffle1.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp sgt <16 x i32> %tmp4, %shuffle4.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp sgt <16 x i32> %tmp6, %shuffle7.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i
				// CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp9 = icmp sgt <16 x i32> %tmp8, %shuffle10.i
				// CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i
				// CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_mask_reduce_max_epi32(__M, __W);
				}

				// Codegen test for _mm512_mask_reduce_max_epu32: returns the intrinsic's result
				// for mask __M and vector __W.
				// The expected IR selects between the input lanes and zeroinitializer using the
				// mask as <16 x i1> (zero is the smallest unsigned value, so unselected lanes
				// cannot win an unsigned max), then runs four shuffle + icmp ugt + select steps
				// and returns element 0 of the <8 x i64> view truncated to i32.
				unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %tmp1 = bitcast i16 %__M to <16 x i1>
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> zeroinitializer
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp ugt <16 x i32> %tmp2, %shuffle1.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp ugt <16 x i32> %tmp4, %shuffle4.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp ugt <16 x i32> %tmp6, %shuffle7.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i
				// CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp9 = icmp ugt <16 x i32> %tmp8, %shuffle10.i
				// CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i
				// CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_mask_reduce_max_epu32(__M, __W);
				}

				// Codegen test for _mm512_mask_reduce_max_ps: returns the intrinsic's result
				// for mask __M and vector __W.
				// The expected IR selects between the input lanes and a constant splat using the
				// mask as <16 x i1>, then performs four shuffle steps, each followed by a call
				// to @llvm.x86.avx512.mask.max.ps.512 (all-ones mask, trailing i32 4 operand),
				// and returns element 0 of the final vector.
				// NOTE(review): the expected fill constant 0x41EFF00000000000 decodes to
				// 4286578688.0, i.e. the integer 0xFF800000 converted to float, rather than
				// the -infinity bit pattern. A large positive fill would let masked-off lanes
				// win a max reduction - confirm against the neutral element defined in the
				// header before relying on this expectation.
				float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){
				// CHECK: %tmp = bitcast i16 %__M to <16 x i1>
				// CHECK: %tmp1 = select <16 x i1> %tmp, <16 x float> %__W, <16 x float> <float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000>
				// CHECK: %shuffle1.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp1, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle4.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp2, <16 x float> %shuffle4.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle7.i = shufflevector <16 x float> %tmp3, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp3, <16 x float> %shuffle7.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle10.i = shufflevector <16 x float> %tmp4, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp4, <16 x float> %shuffle10.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <16 x float> %tmp5, i32 0
				// CHECK: ret float %vecext.i
				return _mm512_mask_reduce_max_ps(__M, __W);
				}

				// Codegen test for _mm512_mask_reduce_min_epi32: returns the intrinsic's result
				// for mask __M and vector __W.
				// The expected IR selects between the input lanes and a splat of 2147483647
				// (the largest signed 32-bit value, so unselected lanes cannot win a signed
				// min) using the mask as <16 x i1>, then runs four shuffle + icmp slt + select
				// steps and returns element 0 of the <8 x i64> view truncated to i32.
				int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %tmp1 = bitcast i16 %__M to <16 x i1>
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp slt <16 x i32> %tmp2, %shuffle1.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp slt <16 x i32> %tmp4, %shuffle4.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp slt <16 x i32> %tmp6, %shuffle7.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i
				// CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp9 = icmp slt <16 x i32> %tmp8, %shuffle10.i
				// CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i
				// CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_mask_reduce_min_epi32(__M, __W);
				}

				// Codegen test for _mm512_mask_reduce_min_epu32: returns the intrinsic's result
				// for mask __M and vector __W.
				// The expected IR selects between the input lanes and a splat of i32 -1 (all
				// bits set, the largest unsigned 32-bit value, so unselected lanes cannot win
				// an unsigned min) using the mask as <16 x i1>, then runs four shuffle +
				// icmp ult + select steps and returns element 0 of the <8 x i64> view
				// truncated to i32.
				unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %tmp1 = bitcast i16 %__M to <16 x i1>
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp ult <16 x i32> %tmp2, %shuffle1.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp ult <16 x i32> %tmp4, %shuffle4.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp ult <16 x i32> %tmp6, %shuffle7.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i
				// CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp9 = icmp ult <16 x i32> %tmp8, %shuffle10.i
				// CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i
				// CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_mask_reduce_min_epu32(__M, __W);
				}

				// Codegen test for _mm512_mask_reduce_min_ps: returns the intrinsic's result
				// for mask __M and vector __W.
				// The expected IR selects between the input lanes and a constant splat using the
				// mask as <16 x i1>, then performs four shuffle steps, each followed by a call
				// to @llvm.x86.avx512.mask.min.ps.512 (all-ones mask, trailing i32 4 operand),
				// and returns element 0 of the final vector.
				// NOTE(review): the expected fill constant 0x41DFE00000000000 decodes to
				// 2139095040.0, i.e. the integer 0x7F800000 converted to float, rather than
				// the +infinity bit pattern. Active lanes holding values above that would lose
				// to a masked-off lane in a min reduction - confirm against the neutral
				// element defined in the header before relying on this expectation.
				float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){
				// CHECK: %tmp = bitcast i16 %__M to <16 x i1>
				// CHECK: %tmp1 = select <16 x i1> %tmp, <16 x float> %__W, <16 x float> <float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000>
				// CHECK: %shuffle1.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp1, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle4.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp2, <16 x float> %shuffle4.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle7.i = shufflevector <16 x float> %tmp3, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp3, <16 x float> %shuffle7.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle10.i = shufflevector <16 x float> %tmp4, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp4, <16 x float> %shuffle10.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <16 x float> %tmp5, i32 0
				// CHECK: ret float %vecext.i
				return _mm512_mask_reduce_min_ps(__M, __W);
				}