This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min) intrinsics to Clang .
ClosedPublic

Authored by m_zuckerman on Oct 26 2016, 7:31 AM.

Download Raw Diff

Details

Reviewers

AsafBadouh
delena
craig.topper
igorb
aymanmus

Commits

rG25eb42023355: [X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min)…
rC285493: [X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min)…
rL285493: [X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min)…

Summary

Vector-reduction arithmetic accepts vectors as inputs and produces scalars as outputs.
This class of vector operation forms the basis of many scientific computations.
In vector-reduction arithmetic, the evaluation of the reduction function f is independent of the order of the input elements of the vector V.

Diff Detail

Event Timeline

m_zuckerman updated this revision to Diff 75883.Oct 26 2016, 7:31 AM

m_zuckerman retitled this revision from to [X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min) intrinsics to Clang . .

m_zuckerman updated this object.

m_zuckerman added reviewers: igorb, delena, AsafBadouh, aymanmus, craig.topper.

craig.topper added inline comments.Oct 26 2016, 9:28 PM

lib/Headers/avx512fintrin.h
344	Do we really need these new set1 macros? The epi ones should be fine shouldn't they?
9942	long long
9947	long long
9957	long long
9962	long long
9979	intialize is misspelled, but even then I don't think this sentence reads right.
9985	Use uppercase 0xFFF.... to match the constants below.
9996	Can we just call the set1 macro outside and pass the result in for Neutral instead of needing T4.
10015	Use uppercase for consistency.
10018	long long
10024	long long
10088	Do these 512-bit shuffles get narrowed to 256-bit and 128-bit ops for the later stages due to the high bit undefs or do we end up doing 512-bit operations all the way through?

m_zuckerman updated this revision to Diff 76032.Oct 27 2016, 8:01 AM

m_zuckerman marked 8 inline comments as done.

m_zuckerman added inline comments.Oct 27 2016, 12:51 PM

lib/Headers/avx512fintrin.h
10088	It will stay all the way 512. This intrinsics only defined on avx512F, and because of that, we can only use the 512bit intrinsics version of the max and min intrinsics.

LGTM with that 1 comment.

lib/Headers/avx512fintrin.h
10046	extra space after the 4th -1

This revision is now accepted and ready to land.Oct 28 2016, 9:39 PM

Closed by commit rL285493: [X86][AVX512][Clang][Intrinsics][reduce] Adding missing reduce (max|min)… (authored by mzuckerm). · Explain WhyOct 29 2016, 3:38 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Headers/

avx512fintrin.h

278 lines

test/

CodeGen/

avx512-reduceMinMaxIntrin.c

436 lines

Diff 76032

lib/Headers/avx512fintrin.h

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 335 Lines • ▼ Show 20 Lines
static __inline __m512i __DEFAULT_FN_ATTRS		static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_set1_epi32(int __s)		_mm512_set1_epi32(int __s)
{		{
return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,		return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,
__s, __s, __s, __s, __s, __s, __s, __s };		__s, __s, __s, __s, __s, __s, __s, __s };
}		}

static __inline __m512i __DEFAULT_FN_ATTRS		static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_set1_epi64(long long __d)		_mm512_set1_epi64(long long __d)
		craig.topperUnsubmitted Not Done Reply Inline Actions Do we really need these new set1 macros? The epi ones should be fine shouldn't they? craig.topper: Do we really need these new set1 macros? The epi ones should be fine shouldn't they?
{		{
return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };		return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
}		}

static __inline__ __m512 __DEFAULT_FN_ATTRS		static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_broadcastss_ps(__m128 __A)		_mm512_broadcastss_ps(__m128 __A)
{		{
return (__m512)__builtin_shufflevector((__v4sf) __A,		return (__m512)__builtin_shufflevector((__v4sf) __A,
▲ Show 20 Lines • Show All 9,537 Lines • ▼ Show 20 Lines	_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
_mm512_mask_reduce_operator_32bit(__W, +, __M, 0, f, , ps);		_mm512_mask_reduce_operator_32bit(__W, +, __M, 0, f, , ps);
}		}

static __inline__ float __DEFAULT_FN_ATTRS		static __inline__ float __DEFAULT_FN_ATTRS
_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {		_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
_mm512_mask_reduce_operator_32bit(__W, *, __M, 1, f, , ps);		_mm512_mask_reduce_operator_32bit(__W, *, __M, 1, f, , ps);
}		}

		// Reduces the eight 64-bit lanes of Vec512 to a single scalar by bisection.
		// At each step the lanes that are still live are split into a low half and
		// a high half with __builtin_shufflevector, and the two halves are combined
		// lane-wise with _mm512_<IntrinName>. This takes log2(n) steps where n is
		// the number of elements in the vector (three steps for eight lanes).
		// A shuffle index of -1 marks a lane whose value is not used afterwards.
		// This macro uses only intrinsics from the AVX512F feature.
		//
		// NOTE: the macro assigns to Vec512 at every step and finishes with
		// `return Vec512[0];`, so Vec512 has to be a modifiable lvalue and expanding
		// the macro returns from the function that invokes it.

		// Vec512 - Vector with size of 512 bits; overwritten by the macro.
		// IntrinName - One of {max|min}_{epi64|epu64|pd}; it is pasted after
		// "_mm512_", for example: _mm512_max_epi64
		// T1 - 'i' for int or 'd' for double. [__m512{i|d}]
		// T2 - 'i' for int or 'f' for float. [__v8d{i|f}]

		#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512, \
		0, 1, 2, 3, -1, -1, -1, -1), \
		(__m512##T1)__builtin_shufflevector( \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512, \
		4, 5, 6, 7, -1, -1, -1, -1)); \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512, \
		0, 1, -1, -1, -1, -1, -1, -1),\
		(__m512##T1)__builtin_shufflevector( \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512, \
		2, 3, -1, -1, -1, -1, -1, \
		-1)); \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512, \
		0, -1, -1, -1, -1, -1, -1, -1),\
		(__m512##T1)__builtin_shufflevector( \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512, \
		1, -1, -1, -1, -1, -1, -1, -1))\
		; \
		return Vec512[0]; \
		})

		static __inline__ long long __DEFAULT_FN_ATTRS
		craig.topperUnsubmitted Done Reply Inline Actions long long craig.topper: long long
		_mm512_reduce_max_epi64(__m512i __V) {
		_mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
		}

		static __inline__ unsigned long long __DEFAULT_FN_ATTRS
		craig.topperUnsubmitted Done Reply Inline Actions long long craig.topper: long long
		_mm512_reduce_max_epu64(__m512i __V) {
		_mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
		}

		static __inline__ double __DEFAULT_FN_ATTRS
		_mm512_reduce_max_pd(__m512d __V) {
		_mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
		}

		static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64
		craig.topperUnsubmitted Done Reply Inline Actions long long craig.topper: long long
		(__m512i __V) {
		_mm512_reduce_maxMin_64bit(__V, min_epi64, i, i);
		}

		static __inline__ unsigned long long __DEFAULT_FN_ATTRS
		craig.topperUnsubmitted Done Reply Inline Actions long long craig.topper: long long
		_mm512_reduce_min_epu64(__m512i __V) {
		_mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
		}

		static __inline__ double __DEFAULT_FN_ATTRS
		_mm512_reduce_min_pd(__m512d __V) {
		_mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
		}

		// Vec512 - Vector with size 512.
		// Vec512Neutral - All vector elements set to the identity element.
		// Identity element: {max_epi,0x8000000000000000}
		// {max_epu,0x0000000000000000}
		// {max_pd, 0xFFF0000000000000}
		// {min_epi,0x7FFFFFFFFFFFFFFF}
		// {min_epu,0xFFFFFFFFFFFFFFFF}
		// {min_pd, 0x7FF0000000000000}
		craig.topperUnsubmitted Done Reply Inline Actions intialize is misspelled, but even then I don't think this sentence reads right. craig.topper: intialize is misspelled, but even then I don't think this sentence reads right.
		//
		// IntrinName - Can be one of following: {max\|min}_{epi64\|epu64\|pd} for example:
		// __mm512_max_epi64
		// T1 - Can get 'i' for int and 'd' for double.[__m512{i\|d}]
		// T2 - Can get 'i' for int and 'f' for float. [__v8d{i\|f}]
		// T3 - Can get 'q' q word and 'pd' for packed double.
		craig.topperUnsubmitted Not Done Reply Inline Actions Use uppercase 0xFFF.... to match the constants below. craig.topper: Use uppercase 0xFFF.... to match the constants below.
		// [__builtin_ia32_select{q\|pd}_512]
		// Mask - Intrinsic Mask

		#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \
		T2, T3, Mask) __extension__({ \
		Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
		(__mmask8)Mask, \
		(__v8d##T2)Vec512, \
		(__v8d##T2)Vec512Neutral); \
		_mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2); \
		})
		craig.topperUnsubmitted Not Done Reply Inline Actions Can we just call the set1 macro outside and pass the result in for Neutral instead of needing T4. craig.topper: Can we just call the set1 macro outside and pass the result in for Neutral instead of needing…

		static __inline__ long long __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000),
		max_epi64, i, i, q, __M);
		}

		static __inline__ unsigned long long __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000),
		max_epu64,i, i, q, __M);
		}

		// Masked maximum over the eight double lanes of __V.
		// __M - lane mask; lanes whose bit is clear are replaced by the neutral
		//       element before the reduction.
		// __V - input vector.
		// Returns the largest selected lane, or -infinity when no lane is selected.
		//
		// The neutral element for max is the value -infinity. Writing it as the
		// integer literal 0xFFF0000000000000 does not work: the literal is converted
		// to double by value (about 1.8e19) instead of being taken as a bit pattern,
		// so masked-off lanes could win against every smaller input.
		static __inline__ double __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
		  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(-__builtin_inf()),
		                                  max_pd, d, f, pd, __M);
		}

		craig.topperUnsubmitted Done Reply Inline Actions Use uppercase for consistency. craig.topper: Use uppercase for consistency.
		static __inline__ long long __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
		craig.topperUnsubmitted Done Reply Inline Actions long long craig.topper: long long
		min_epi64, i, i, q, __M);
		}

		static __inline__ unsigned long long __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
		craig.topperUnsubmitted Done Reply Inline Actions long long craig.topper: long long
		min_epu64, i, i, q, __M);
		}

		// Masked minimum over the eight double lanes of __V.
		// __M - lane mask; lanes whose bit is clear are replaced by the neutral
		//       element before the reduction.
		// __V - input vector.
		// Returns the smallest selected lane, or +infinity when no lane is selected.
		//
		// The neutral element for min is the value +infinity. Writing it as the
		// integer literal 0x7FF0000000000000 does not work: the literal is converted
		// to double by value (about 9.2e18) instead of being taken as a bit pattern,
		// so masked-off lanes could win against every larger input.
		static __inline__ double __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
		  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()),
		                                  min_pd, d, f, pd, __M);
		}

		// Vec512 - Vector with size 512.
		// IntrinName - Can be one of following: {max\|min}_{epi32\|epu32\|ps} for example:
		// __mm512_max_epi32
		// T1 - Can get 'i' for int and ' ' .[__m512{i\|}]
		// T2 - Can get 'i' for int and 'f' for float.[__v16s{i\|f}]

		#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		0, 1, 2, 3, 4, 5, 6, 7, \
		-1, -1, -1, -1 , -1, -1, -1, -1), \
		craig.topperUnsubmitted Not Done Reply Inline Actions extra space after the 4th -1 craig.topper: extra space after the 4th -1
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		8, 9, 10, 11, 12, 13, 14, 15, \
		-1, -1, -1, -1, -1, -1, -1, -1)); \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		0, 1, 2, 3, -1, -1, -1, -1, \
		-1, -1, -1, -1, -1, -1, -1, -1), \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		4, 5, 6, 7, -1, -1, -1, -1, \
		-1, -1, -1, -1, -1, -1, -1, -1)); \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		0, 1, -1, -1, -1, -1, -1, -1, \
		-1, -1, -1, -1, -1, -1, -1, -1), \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		2, 3, -1, -1, -1, -1, -1, -1, \
		-1, -1, -1, -1, -1, -1, -1, -1)); \
		Vec512 = _mm512_##IntrinName( \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		0, -1, -1, -1, -1, -1, -1, -1, \
		-1, -1, -1, -1, -1, -1, -1, -1), \
		(__m512##T1)__builtin_shufflevector( \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512, \
		1, -1, -1, -1, -1, -1, -1, -1, \
		-1, -1, -1, -1, -1, -1, -1, -1)); \
		return Vec512[0]; \
		})

		static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i a) {
		craig.topperUnsubmitted Not Done Reply Inline Actions Do these 512-bit shuffles get narrowed to 256-bit and 128-bit ops for the later stages due to the high bit undefs or do we end up doing 512-bit operations all the way through? craig.topper: Do these 512-bit shuffles get narrowed to 256-bit and 128-bit ops for the later stages due to…
		m_zuckermanAuthorUnsubmitted Not Done Reply Inline Actions It will stay all the way 512. This intrinsics only defined on avx512F, and because of that, we can only use the 512bit intrinsics version of the max and min intrinsics. m_zuckerman: It will stay all the way 512. This intrinsics only defined on avx512F, and because of that, we…
		_mm512_reduce_maxMin_32bit(a, max_epi32, i, i);
		}

		static __inline__ unsigned int __DEFAULT_FN_ATTRS
		_mm512_reduce_max_epu32(__m512i a) {
		_mm512_reduce_maxMin_32bit(a, max_epu32, i, i);
		}

		static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 a) {
		_mm512_reduce_maxMin_32bit(a, max_ps, , f);
		}

		static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i a) {
		_mm512_reduce_maxMin_32bit(a, min_epi32, i, i);
		}

		static __inline__ unsigned int __DEFAULT_FN_ATTRS
		_mm512_reduce_min_epu32(__m512i a) {
		_mm512_reduce_maxMin_32bit(a, min_epu32, i, i);
		}

		static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 a) {
		_mm512_reduce_maxMin_32bit(a, min_ps, , f);
		}

		// Masked variant of the 32-bit max/min reduction: first selects, lane by
		// lane, between Vec512 and Vec512Neutral according to Mask using
		// __builtin_ia32_select<T3>_512, stores the result back into Vec512, and then
		// expands _mm512_reduce_maxMin_32bit on it.
		//
		// Vec512 - Vector with size 512 bits; overwritten by the macro.
		// Vec512Neutral - All vector elements set to the identity element.
		// Identity element, given as the 32-bit lane bit pattern:
		// {max_epi,0x80000000}
		// {max_epu,0x00000000}
		// {max_ps, 0xFF800000} (IEEE-754 single-precision -infinity)
		// {min_epi,0x7FFFFFFF}
		// {min_epu,0xFFFFFFFF}
		// {min_ps, 0x7F800000} (IEEE-754 single-precision +infinity)
		//
		// IntrinName - One of {max|min}_{epi32|epu32|ps}; it is pasted after
		// "_mm512_", for example: _mm512_max_epi32
		// T1 - 'i' for int or empty for float. [__m512{i|}]
		// T2 - 'i' for int or 'f' for float. [__v16s{i|f}]
		// T3 - 'd' for double word or 'ps' for packed single.
		// [__builtin_ia32_select{d|ps}_512]
		// Mask - Intrinsic mask, cast to __mmask16.

		#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \
		T2, T3, Mask) __extension__({ \
		Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
		(__mmask16)Mask, \
		(__v16s##T2)Vec512, \
		(__v16s##T2)Vec512Neutral); \
		_mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2); \
		})

		static __inline__ int __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32,
		i, i, d, __M);
		}

		static __inline__ unsigned int __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32
		, i, i, d, __M);
		}

		// Masked maximum over the sixteen float lanes of __V.
		// __M - lane mask; lanes whose bit is clear are replaced by the neutral
		//       element before the reduction.
		// __V - input vector.
		// Returns the largest selected lane, or -infinity when no lane is selected.
		//
		// The neutral element for max is the value -infinity. Writing it as the
		// integer literal 0xFF800000 does not work: the literal is converted to
		// float by value (about 4.29e9) instead of being taken as a bit pattern,
		// so masked-off lanes could win against every smaller input.
		static __inline__ float __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
		  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(-__builtin_inff()),
		                                  max_ps, , f, ps, __M);
		}

		static __inline__ int __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32
		, i, i, d, __M);
		}

		static __inline__ unsigned int __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
		_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32
		, i, i, d, __M);
		}

		// Masked minimum over the sixteen float lanes of __V.
		// __M - lane mask; lanes whose bit is clear are replaced by the neutral
		//       element before the reduction.
		// __V - input vector.
		// Returns the smallest selected lane, or +infinity when no lane is selected.
		//
		// The neutral element for min is the value +infinity. Writing it as the
		// integer literal 0x7F800000 does not work: the literal is converted to
		// float by value (about 2.14e9) instead of being taken as a bit pattern,
		// so masked-off lanes could win against every larger input.
		static __inline__ float __DEFAULT_FN_ATTRS
		_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
		  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()),
		                                  min_ps, , f, ps, __M);
		}

#undef __DEFAULT_FN_ATTRS		#undef __DEFAULT_FN_ATTRS

#endif // __AVX512FINTRIN_H		#endif // __AVX512FINTRIN_H

test/CodeGen/avx512-reduceMinMaxIntrin.c

				// RUN: %clang_cc1 -ffreestanding %s -O2 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror \|opt -instnamer -S \|FileCheck %s

				#include <immintrin.h>

				long long test_mm512_reduce_max_epi64(__m512i __W){
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = icmp slt <8 x i64> %shuffle1.i, %__W
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle3.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle6.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0
				// CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0
				// CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_reduce_max_epi64(__W);
				}

				unsigned long long test_mm512_reduce_max_epu64(__m512i __W){
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = icmp ult <8 x i64> %shuffle1.i, %__W
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle3.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle6.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0
				// CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0
				// CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_reduce_max_epu64(__W);
				}

				double test_mm512_reduce_max_pd(__m512d __W){
				// CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %__W, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle3.i = shufflevector <8 x double> %tmp, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp, <8 x double> %shuffle3.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle6.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp1, <8 x double> %shuffle6.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <8 x double> %tmp2, i32 0
				// CHECK: ret double %vecext.i
				return _mm512_reduce_max_pd(__W);
				}

				// NOTE(review): despite its name, this test calls _mm512_reduce_max_epi64
				// (see the return statement), and the expectations below are identical to
				// those of test_mm512_reduce_max_epi64. _mm512_reduce_min_epi64 is therefore
				// not exercised here; the call and the expected IR both need to be
				// regenerated for the min intrinsic.
				long long test_mm512_reduce_min_epi64(__m512i __W){
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = icmp slt <8 x i64> %shuffle1.i, %__W
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle3.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle6.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0
				// CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0
				// CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_reduce_max_epi64(__W);
				}

				// NOTE(review): despite its name, this test calls _mm512_reduce_max_epu64
				// (see the return statement), and the expectations below are identical to
				// those of test_mm512_reduce_max_epu64. _mm512_reduce_min_epu64 is therefore
				// not exercised here; the call and the expected IR both need to be
				// regenerated for the min intrinsic.
				unsigned long long test_mm512_reduce_min_epu64(__m512i __W){
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = icmp ult <8 x i64> %shuffle1.i, %__W
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle3.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle6.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp4, i32 0
				// CHECK: %.elt20.i = extractelement <8 x i64> %tmp3, i32 0
				// CHECK: %shuffle6.elt.i = extractelement <8 x i64> %tmp3, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt20.i, i64 %shuffle6.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_reduce_max_epu64(__W);
				}

				double test_mm512_reduce_min_pd(__m512d __W){
				// CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %__W, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle3.i = shufflevector <8 x double> %tmp, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp, <8 x double> %shuffle3.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle6.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp1, <8 x double> %shuffle6.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <8 x double> %tmp2, i32 0
				// CHECK: ret double %vecext.i
				return _mm512_reduce_min_pd(__W);
				}

				long long test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){
				// CHECK: %tmp = bitcast i8 %__M to <8 x i1>
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp sgt <8 x i64> %tmp1, %shuffle1.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp sgt <8 x i64> %tmp3, %shuffle4.i
				// CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp6 = icmp sgt <8 x i64> %tmp5, %shuffle7.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0
				// CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0
				// CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_mask_reduce_max_epi64(__M, __W);
				}

				unsigned long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){
				// CHECK: %tmp = bitcast i8 %__M to <8 x i1>
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> zeroinitializer
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle1.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle4.i
				// CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp6 = icmp ugt <8 x i64> %tmp5, %shuffle7.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0
				// CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0
				// CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_mask_reduce_max_epu64(__M, __W);
				}

				// NOTE(review): _mm512_mask_reduce_max_pd returns double, but this test is
				// declared to return long long; that is why the expectations end with an
				// fptosi conversion and `ret i64` instead of `ret double`.
				// NOTE(review): the expected neutral constant 0x43EFFE0000000000 is the
				// double 4095 * 2^52, i.e. the integer literal 0xFFF0000000000000 converted
				// by value. It is not the bit pattern of -infinity (0xFFF0000000000000),
				// which is the identity element a masked max needs.
				long long test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){
				// CHECK: %tmp = bitcast i8 %__M to <8 x i1>
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x double> %__W, <8 x double> <double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000, double 0x43EFFE0000000000>
				// CHECK: %shuffle1.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp1, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle4.i = shufflevector <8 x double> %tmp2, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp2, <8 x double> %shuffle4.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle7.i = shufflevector <8 x double> %tmp3, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = tail call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %tmp3, <8 x double> %shuffle7.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <8 x double> %tmp4, i32 0
				// CHECK: %conv = fptosi double %vecext.i to i64
				// CHECK: ret i64 %conv
				return _mm512_mask_reduce_max_pd(__M, __W);
				}

				long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){
				// CHECK: %tmp = bitcast i8 %__M to <8 x i1>
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp slt <8 x i64> %tmp1, %shuffle1.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp slt <8 x i64> %tmp3, %shuffle4.i
				// CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp6 = icmp slt <8 x i64> %tmp5, %shuffle7.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0
				// CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0
				// CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_mask_reduce_min_epi64(__M, __W);
				}

				// NOTE(review): despite its name, this test calls
				// _mm512_mask_reduce_max_epu64 (see the return statement). The expectations
				// below are identical to those of test_mm512_mask_reduce_max_epu64: a
				// zeroinitializer neutral vector and icmp ugt comparisons.
				// _mm512_mask_reduce_min_epu64 is therefore not exercised here.
				// NOTE(review): the return type is long long although the epu64 intrinsics
				// return unsigned long long.
				long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){
				// CHECK: %tmp = bitcast i8 %__M to <8 x i1>
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x i64> %__W, <8 x i64> zeroinitializer
				// CHECK: %shuffle1.i = shufflevector <8 x i64> %tmp1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = icmp ugt <8 x i64> %tmp1, %shuffle1.i
				// CHECK: %tmp3 = select <8 x i1> %tmp2, <8 x i64> %tmp1, <8 x i64> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <8 x i64> %tmp3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = icmp ugt <8 x i64> %tmp3, %shuffle4.i
				// CHECK: %tmp5 = select <8 x i1> %tmp4, <8 x i64> %tmp3, <8 x i64> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <8 x i64> %tmp5, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp6 = icmp ugt <8 x i64> %tmp5, %shuffle7.i
				// CHECK: %.elt.i = extractelement <8 x i1> %tmp6, i32 0
				// CHECK: %.elt22.i = extractelement <8 x i64> %tmp5, i32 0
				// CHECK: %shuffle7.elt.i = extractelement <8 x i64> %tmp5, i32 1
				// CHECK: %vecext.i = select i1 %.elt.i, i64 %.elt22.i, i64 %shuffle7.elt.i
				// CHECK: ret i64 %vecext.i
				return _mm512_mask_reduce_max_epu64(__M, __W);
				}

				// Verifies the IR emitted for _mm512_mask_reduce_min_pd: a mask-driven select
				// against the neutral vector followed by three shuffle + min.pd.512 steps and
				// an extract of lane 0.
				// NOTE(review): the expected neutral constant 0x43DFFC0000000000 is the
				// double 2047 * 2^52, i.e. the integer literal 0x7FF0000000000000 converted
				// by value. It is not the bit pattern of +infinity (0x7FF0000000000000),
				// which is the identity element a masked min needs.
				double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){
				// CHECK: %tmp = bitcast i8 %__M to <8 x i1>
				// CHECK: %tmp1 = select <8 x i1> %tmp, <8 x double> %__W, <8 x double> <double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000, double 0x43DFFC0000000000>
				// CHECK: %shuffle1.i = shufflevector <8 x double> %tmp1, <8 x double> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp1, <8 x double> %shuffle1.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle4.i = shufflevector <8 x double> %tmp2, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp2, <8 x double> %shuffle4.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %shuffle7.i = shufflevector <8 x double> %tmp3, <8 x double> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = tail call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %tmp3, <8 x double> %shuffle7.i, <8 x double> zeroinitializer, i8 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <8 x double> %tmp4, i32 0
				// CHECK: ret double %vecext.i
				return _mm512_mask_reduce_min_pd(__M, __W);
				}

				// Codegen test for _mm512_reduce_max_epi32.
				// Expected IR: the input is viewed as <16 x i32> and reduced in four
				// shuffle / signed-greater-than compare / select steps (upper 8, 4, 2 and 1
				// lanes); lane 0 of the result is extracted as i64 and truncated to i32.
				int test_mm512_reduce_max_epi32(__m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = icmp sgt <16 x i32> %tmp, %shuffle1.i
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp sgt <16 x i32> %tmp2, %shuffle3.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp sgt <16 x i32> %tmp4, %shuffle6.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i
				// CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp sgt <16 x i32> %tmp6, %shuffle9.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i
				// CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_reduce_max_epi32(__W);
				}

				// Codegen test for _mm512_reduce_max_epu32.
				// Expected IR: same four-step shuffle / compare / select reduction as the
				// signed variant, but using the unsigned-greater-than predicate (ugt); lane 0
				// of the result is extracted as i64 and truncated to i32.
				unsigned int test_mm512_reduce_max_epu32(__m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = icmp ugt <16 x i32> %tmp, %shuffle1.i
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp ugt <16 x i32> %tmp2, %shuffle3.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp ugt <16 x i32> %tmp4, %shuffle6.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i
				// CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp ugt <16 x i32> %tmp6, %shuffle9.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i
				// CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_reduce_max_epu32(__W);
				}

				// Codegen test for _mm512_reduce_max_ps.
				// Expected IR: four llvm.x86.avx512.mask.max.ps.512 calls, each combining the
				// running vector with a shuffle of its upper 8, 4, 2 and 1 lanes (all-ones
				// mask, zero pass-through); lane 0 of the last result is returned.
				float test_mm512_reduce_max_ps(__m512 __W){
				// CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %__W, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle3.i = shufflevector <16 x float> %tmp, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp, <16 x float> %shuffle3.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle6.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp1, <16 x float> %shuffle6.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle9.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp2, <16 x float> %shuffle9.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <16 x float> %tmp3, i32 0
				// CHECK: ret float %vecext.i
				return _mm512_reduce_max_ps(__W);
				}

				// Codegen test for _mm512_reduce_min_epi32.
				// Expected IR: the input is viewed as <16 x i32> and reduced in four
				// shuffle / signed-less-than compare / select steps (upper 8, 4, 2 and 1
				// lanes); lane 0 of the result is extracted as i64 and truncated to i32.
				int test_mm512_reduce_min_epi32(__m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = icmp slt <16 x i32> %tmp, %shuffle1.i
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp slt <16 x i32> %tmp2, %shuffle3.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp slt <16 x i32> %tmp4, %shuffle6.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i
				// CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp slt <16 x i32> %tmp6, %shuffle9.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i
				// CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_reduce_min_epi32(__W);
				}

				// Codegen test for _mm512_reduce_min_epu32.
				// Expected IR: same four-step shuffle / compare / select reduction as the
				// signed variant, but using the unsigned-less-than predicate (ult); lane 0 of
				// the result is extracted as i64 and truncated to i32.
				unsigned int test_mm512_reduce_min_epu32(__m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = icmp ult <16 x i32> %tmp, %shuffle1.i
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> %shuffle1.i
				// CHECK: %shuffle3.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp ult <16 x i32> %tmp2, %shuffle3.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle3.i
				// CHECK: %shuffle6.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp ult <16 x i32> %tmp4, %shuffle6.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle6.i
				// CHECK: %shuffle9.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp ult <16 x i32> %tmp6, %shuffle9.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle9.i
				// CHECK: %tmp9 = bitcast <16 x i32> %tmp8 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp9, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_reduce_min_epu32(__W);
				}

				// Codegen test for _mm512_reduce_min_ps.
				// Expected IR: four llvm.x86.avx512.mask.min.ps.512 calls, each combining the
				// running vector with a shuffle of its upper 8, 4, 2 and 1 lanes (all-ones
				// mask, zero pass-through); lane 0 of the last result is returned.
				float test_mm512_reduce_min_ps(__m512 __W){
				// CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %__W, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle3.i = shufflevector <16 x float> %tmp, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp1 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp, <16 x float> %shuffle3.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle6.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp1, <16 x float> %shuffle6.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle9.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp2, <16 x float> %shuffle9.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <16 x float> %tmp3, i32 0
				// CHECK: ret float %vecext.i
				return _mm512_reduce_min_ps(__W);
				}

				// Codegen test for _mm512_mask_reduce_max_epi32.
				// Expected IR: lanes whose mask bit is clear are replaced by -2147483648 (the
				// smallest i32), then four shuffle / signed-greater-than compare / select
				// steps fold the upper 8, 4, 2 and 1 lanes; lane 0 of the result is extracted
				// as i64 and truncated to i32.
				int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %tmp1 = bitcast i16 %__M to <16 x i1>
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp sgt <16 x i32> %tmp2, %shuffle1.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp sgt <16 x i32> %tmp4, %shuffle4.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp sgt <16 x i32> %tmp6, %shuffle7.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i
				// CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp9 = icmp sgt <16 x i32> %tmp8, %shuffle10.i
				// CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i
				// CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_mask_reduce_max_epi32(__M, __W);
				}

				// Codegen test for _mm512_mask_reduce_max_epu32.
				// Expected IR: lanes whose mask bit is clear are replaced by zero (the
				// smallest unsigned value), then four shuffle / unsigned-greater-than
				// compare / select steps fold the upper 8, 4, 2 and 1 lanes; lane 0 of the
				// result is extracted as i64 and truncated to i32.
				unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %tmp1 = bitcast i16 %__M to <16 x i1>
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> zeroinitializer
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp ugt <16 x i32> %tmp2, %shuffle1.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp ugt <16 x i32> %tmp4, %shuffle4.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp ugt <16 x i32> %tmp6, %shuffle7.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i
				// CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp9 = icmp ugt <16 x i32> %tmp8, %shuffle10.i
				// CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i
				// CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_mask_reduce_max_epu32(__M, __W);
				}

				// Codegen test for _mm512_mask_reduce_max_ps.
				// Expected IR: lanes whose mask bit is clear are replaced by a constant, then
				// four llvm.x86.avx512.mask.max.ps.512 calls fold the upper 8, 4, 2 and 1
				// lanes into the lower ones, and lane 0 of the last result is returned.
				// NOTE(review): the fill constant 0x41EFF00000000000 is 4286578688.0, which
				// looks like the integer 0xFF800000 converted to float rather than the -inf
				// bit pattern -- confirm against the header's neutral element.
				float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){
				// CHECK: %tmp = bitcast i16 %__M to <16 x i1>
				// CHECK: %tmp1 = select <16 x i1> %tmp, <16 x float> %__W, <16 x float> <float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000, float 0x41EFF00000000000>
				// CHECK: %shuffle1.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp1, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle4.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp2, <16 x float> %shuffle4.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle7.i = shufflevector <16 x float> %tmp3, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp3, <16 x float> %shuffle7.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle10.i = shufflevector <16 x float> %tmp4, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = tail call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %tmp4, <16 x float> %shuffle10.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <16 x float> %tmp5, i32 0
				// CHECK: ret float %vecext.i
				return _mm512_mask_reduce_max_ps(__M, __W);
				}

				// Codegen test for _mm512_mask_reduce_min_epi32.
				// Expected IR: lanes whose mask bit is clear are replaced by 2147483647 (the
				// largest i32), then four shuffle / signed-less-than compare / select steps
				// fold the upper 8, 4, 2 and 1 lanes; lane 0 of the result is extracted as
				// i64 and truncated to i32.
				int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %tmp1 = bitcast i16 %__M to <16 x i1>
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp slt <16 x i32> %tmp2, %shuffle1.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp slt <16 x i32> %tmp4, %shuffle4.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp slt <16 x i32> %tmp6, %shuffle7.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i
				// CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp9 = icmp slt <16 x i32> %tmp8, %shuffle10.i
				// CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i
				// CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_mask_reduce_min_epi32(__M, __W);
				}

				// Codegen test for _mm512_mask_reduce_min_epu32.
				// Expected IR: lanes whose mask bit is clear are replaced by -1 (all bits
				// set, the largest unsigned value), then four shuffle / unsigned-less-than
				// compare / select steps fold the upper 8, 4, 2 and 1 lanes; lane 0 of the
				// result is extracted as i64 and truncated to i32.
				unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){
				// CHECK: %tmp = bitcast <8 x i64> %__W to <16 x i32>
				// CHECK: %tmp1 = bitcast i16 %__M to <16 x i1>
				// CHECK: %tmp2 = select <16 x i1> %tmp1, <16 x i32> %tmp, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
				// CHECK: %shuffle1.i = shufflevector <16 x i32> %tmp2, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = icmp ult <16 x i32> %tmp2, %shuffle1.i
				// CHECK: %tmp4 = select <16 x i1> %tmp3, <16 x i32> %tmp2, <16 x i32> %shuffle1.i
				// CHECK: %shuffle4.i = shufflevector <16 x i32> %tmp4, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = icmp ult <16 x i32> %tmp4, %shuffle4.i
				// CHECK: %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> %shuffle4.i
				// CHECK: %shuffle7.i = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp7 = icmp ult <16 x i32> %tmp6, %shuffle7.i
				// CHECK: %tmp8 = select <16 x i1> %tmp7, <16 x i32> %tmp6, <16 x i32> %shuffle7.i
				// CHECK: %shuffle10.i = shufflevector <16 x i32> %tmp8, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp9 = icmp ult <16 x i32> %tmp8, %shuffle10.i
				// CHECK: %tmp10 = select <16 x i1> %tmp9, <16 x i32> %tmp8, <16 x i32> %shuffle10.i
				// CHECK: %tmp11 = bitcast <16 x i32> %tmp10 to <8 x i64>
				// CHECK: %vecext.i = extractelement <8 x i64> %tmp11, i32 0
				// CHECK: %conv.i = trunc i64 %vecext.i to i32
				// CHECK: ret i32 %conv.i
				return _mm512_mask_reduce_min_epu32(__M, __W);
				}

				// Codegen test for _mm512_mask_reduce_min_ps.
				// Expected IR: lanes whose mask bit is clear are replaced by a constant, then
				// four llvm.x86.avx512.mask.min.ps.512 calls fold the upper 8, 4, 2 and 1
				// lanes into the lower ones, and lane 0 of the last result is returned.
				// NOTE(review): the fill constant 0x41DFE00000000000 is 2139095040.0, which
				// looks like the integer 0x7F800000 converted to float rather than the +inf
				// bit pattern -- confirm against the header's neutral element.
				float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){
				// CHECK: %tmp = bitcast i16 %__M to <16 x i1>
				// CHECK: %tmp1 = select <16 x i1> %tmp, <16 x float> %__W, <16 x float> <float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000, float 0x41DFE00000000000>
				// CHECK: %shuffle1.i = shufflevector <16 x float> %tmp1, <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp2 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp1, <16 x float> %shuffle1.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle4.i = shufflevector <16 x float> %tmp2, <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp3 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp2, <16 x float> %shuffle4.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle7.i = shufflevector <16 x float> %tmp3, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp4 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp3, <16 x float> %shuffle7.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %shuffle10.i = shufflevector <16 x float> %tmp4, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				// CHECK: %tmp5 = tail call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %tmp4, <16 x float> %shuffle10.i, <16 x float> zeroinitializer, i16 -1, i32 4) #3
				// CHECK: %vecext.i = extractelement <16 x float> %tmp5, i32 0
				// CHECK: ret float %vecext.i
				return _mm512_mask_reduce_min_ps(__M, __W);
				}