This is an archive of the discontinued LLVM Phabricator instance.

Differential D87604

[X86] Convert integer _mm_reduce_* intrinsics to emit llvm.reduction intrinsics (PR47506)
ClosedPublic

Authored by RKSimon on Sep 14 2020, 4:41 AM.

Download Raw Diff

Details

Reviewers

craig.topper

Commits

rG6c23cbc5603c: [X86] Convert integer _mm_reduce_* intrinsics to emit llvm.reduction intrinsics…

Summary

Placeholder patch for when the reduction intrinsics drop their experimental status, emitting the equivalent reduction intrinsic in IR instead of expanding to shuffle+arithmetic sequences.

The fadd/fmul reductions might be trickier as they assume a similar bisection reduction while the generic intrinsics assume a sequential reduction (Intel docs are ambiguous on the correct approach) - I'm not sure if we want to always tag them with reassoc? Anyway, that issue can wait until a separate fp patch along with the fmin/fmax reductions.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

RKSimon created this revision.Sep 14 2020, 4:41 AM

Herald added a project: Restricted Project. · View Herald TranscriptSep 14 2020, 4:41 AM

RKSimon requested review of this revision.Sep 14 2020, 4:41 AM

Harbormaster completed remote builds in B71548: Diff 291539.Sep 14 2020, 5:14 AM

Longer term should we have a variadic version of this like __builtin_shufflevector or __builtin_convertvector that can handle any reduction?

In D87604#2272211, @craig.topper wrote:

Longer term should we have a variadic version of this like __builtin_shufflevector or __builtin_convertvector that can handle any reduction?

Yes, I was looking at what would be necessary for that as well - I think that is what is proposed in PR36691.

WIP until reductions are no longer experimental

rebased now that the reduction intrinsics are no longer experimental

Herald added a subscriber: pengfei. · View Herald TranscriptOct 8 2020, 5:03 AM

Harbormaster completed remote builds in B74425: Diff 296930.Oct 8 2020, 5:35 AM

Ping?

LGTM

This revision is now accepted and ready to land.Oct 12 2020, 1:30 PM

This revision was landed with ongoing or failed builds.Oct 13 2020, 1:32 AM

Closed by commit rG6c23cbc5603c: [X86] Convert integer _mm_reduce_* intrinsics to emit llvm.reduction intrinsics… (authored by RKSimon). · Explain Why

This revision was automatically updated to reflect the committed changes.

RKSimon added a commit: rG6c23cbc5603c: [X86] Convert integer _mm_reduce_* intrinsics to emit llvm.reduction intrinsics….

RKSimon mentioned this in D92940: [X86] Convert fadd/fmul _mm_reduce_* intrinsics to emit llvm.reduction intrinsics (PR47506).Dec 9 2020, 7:25 AM

RKSimon mentioned this in rG4855a1004d4d: [X86] Convert fadd/fmul _mm_reduce_* intrinsics to emit llvm.reduction….Dec 13 2020, 7:39 AM

Revision Contents

Path

Size

clang/

include/

clang/

Basic/

BuiltinsX86.def

18 lines

lib/

CodeGen/

CGBuiltin.cpp

50 lines

Headers/

avx512fintrin.h

115 lines

test/

CodeGen/

X86/

avx512-reduceIntrin.c

181 lines

avx512-reduceMinMaxIntrin.c

210 lines

Diff 291539

clang/include/clang/Basic/BuiltinsX86.def

	Show First 20 Lines • Show All 1,865 Lines • ▼ Show 20 Lines
	TARGET_BUILTIN(__builtin_ia32_selectps_256, "V8fUcV8fV8f", "ncV:256:", "avx512vl")			TARGET_BUILTIN(__builtin_ia32_selectps_256, "V8fUcV8fV8f", "ncV:256:", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_selectps_512, "V16fUsV16fV16f", "ncV:512:", "avx512f")			TARGET_BUILTIN(__builtin_ia32_selectps_512, "V16fUsV16fV16f", "ncV:512:", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_selectpd_128, "V2dUcV2dV2d", "ncV:128:", "avx512vl")			TARGET_BUILTIN(__builtin_ia32_selectpd_128, "V2dUcV2dV2d", "ncV:128:", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_selectpd_256, "V4dUcV4dV4d", "ncV:256:", "avx512vl")			TARGET_BUILTIN(__builtin_ia32_selectpd_256, "V4dUcV4dV4d", "ncV:256:", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_selectpd_512, "V8dUcV8dV8d", "ncV:512:", "avx512f")			TARGET_BUILTIN(__builtin_ia32_selectpd_512, "V8dUcV8dV8d", "ncV:512:", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_selectss_128, "V4fUcV4fV4f", "ncV:128:", "avx512f")			TARGET_BUILTIN(__builtin_ia32_selectss_128, "V4fUcV4fV4f", "ncV:128:", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_selectsd_128, "V2dUcV2dV2d", "ncV:128:", "avx512f")			TARGET_BUILTIN(__builtin_ia32_selectsd_128, "V2dUcV2dV2d", "ncV:128:", "avx512f")

				// generic reduction intrinsics
				TARGET_BUILTIN(__builtin_ia32_reduce_add_d512, "iV16i", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_add_q512, "OiV8Oi", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_and_d512, "iV16i", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_and_q512, "OiV8Oi", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_mul_d512, "iV16i", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_mul_q512, "OiV8Oi", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_or_d512, "iV16i", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_or_q512, "OiV8Oi", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_smax_d512, "iV16i", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_smax_q512, "OiV8Oi", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_smin_d512, "iV16i", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_smin_q512, "OiV8Oi", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_umax_d512, "iV16i", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_umax_q512, "OiV8Oi", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_umin_d512, "iV16i", "ncV:512:", "avx512f")
				TARGET_BUILTIN(__builtin_ia32_reduce_umin_q512, "OiV8Oi", "ncV:512:", "avx512f")

	// MONITORX/MWAITX			// MONITORX/MWAITX
	TARGET_BUILTIN(__builtin_ia32_monitorx, "vvC*UiUi", "n", "mwaitx")			TARGET_BUILTIN(__builtin_ia32_monitorx, "vvC*UiUi", "n", "mwaitx")
	TARGET_BUILTIN(__builtin_ia32_mwaitx, "vUiUiUi", "n", "mwaitx")			TARGET_BUILTIN(__builtin_ia32_mwaitx, "vUiUiUi", "n", "mwaitx")

	// WAITPKG			// WAITPKG
	TARGET_BUILTIN(__builtin_ia32_umonitor, "vvC*", "n", "waitpkg")			TARGET_BUILTIN(__builtin_ia32_umonitor, "vvC*", "n", "waitpkg")
	TARGET_BUILTIN(__builtin_ia32_umwait, "UcUiUiUi", "n", "waitpkg")			TARGET_BUILTIN(__builtin_ia32_umwait, "UcUiUiUi", "n", "waitpkg")
	TARGET_BUILTIN(__builtin_ia32_tpause, "UcUiUiUi", "n", "waitpkg")			TARGET_BUILTIN(__builtin_ia32_tpause, "UcUiUiUi", "n", "waitpkg")
	▲ Show 20 Lines • Show All 67 Lines • Show Last 20 Lines

clang/lib/CodeGen/CGBuiltin.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 9,991 Lines • ▼ Show 20 Lines
	case X86::BI__builtin_ia32_vpshrdvq256:			case X86::BI__builtin_ia32_vpshrdvq256:
	case X86::BI__builtin_ia32_vpshrdvq512:			case X86::BI__builtin_ia32_vpshrdvq512:
	case X86::BI__builtin_ia32_vpshrdvw128:			case X86::BI__builtin_ia32_vpshrdvw128:
	case X86::BI__builtin_ia32_vpshrdvw256:			case X86::BI__builtin_ia32_vpshrdvw256:
	case X86::BI__builtin_ia32_vpshrdvw512:			case X86::BI__builtin_ia32_vpshrdvw512:
	// Ops 0 and 1 are swapped.			// Ops 0 and 1 are swapped.
	return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);			return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);

				// Reductions
				case X86::BI__builtin_ia32_reduce_add_d512:
				case X86::BI__builtin_ia32_reduce_add_q512: {
				Function *F = CGM.getIntrinsic(Intrinsic::experimental_vector_reduce_add,
				Ops[0]->getType());
				return Builder.CreateCall(F, {Ops[0]});
				}
				case X86::BI__builtin_ia32_reduce_and_d512:
				case X86::BI__builtin_ia32_reduce_and_q512: {
				Function *F = CGM.getIntrinsic(Intrinsic::experimental_vector_reduce_and,
				Ops[0]->getType());
				return Builder.CreateCall(F, {Ops[0]});
				}
				case X86::BI__builtin_ia32_reduce_mul_d512:
				case X86::BI__builtin_ia32_reduce_mul_q512: {
				Function *F = CGM.getIntrinsic(Intrinsic::experimental_vector_reduce_mul,
				Ops[0]->getType());
				return Builder.CreateCall(F, {Ops[0]});
				}
				case X86::BI__builtin_ia32_reduce_or_d512:
				case X86::BI__builtin_ia32_reduce_or_q512: {
				Function *F = CGM.getIntrinsic(Intrinsic::experimental_vector_reduce_or,
				Ops[0]->getType());
				return Builder.CreateCall(F, {Ops[0]});
				}
				case X86::BI__builtin_ia32_reduce_smax_d512:
				case X86::BI__builtin_ia32_reduce_smax_q512: {
				Function *F = CGM.getIntrinsic(Intrinsic::experimental_vector_reduce_smax,
				Ops[0]->getType());
				return Builder.CreateCall(F, {Ops[0]});
				}
				case X86::BI__builtin_ia32_reduce_smin_d512:
				case X86::BI__builtin_ia32_reduce_smin_q512: {
				Function *F = CGM.getIntrinsic(Intrinsic::experimental_vector_reduce_smin,
				Ops[0]->getType());
				return Builder.CreateCall(F, {Ops[0]});
				}
				case X86::BI__builtin_ia32_reduce_umax_d512:
				case X86::BI__builtin_ia32_reduce_umax_q512: {
				Function *F = CGM.getIntrinsic(Intrinsic::experimental_vector_reduce_umax,
				Ops[0]->getType());
				return Builder.CreateCall(F, {Ops[0]});
				}
				case X86::BI__builtin_ia32_reduce_umin_d512:
				case X86::BI__builtin_ia32_reduce_umin_q512: {
				Function *F = CGM.getIntrinsic(Intrinsic::experimental_vector_reduce_umin,
				Ops[0]->getType());
				return Builder.CreateCall(F, {Ops[0]});
				}

	// 3DNow!			// 3DNow!
	case X86::BI__builtin_ia32_pswapdsf:			case X86::BI__builtin_ia32_pswapdsf:
	case X86::BI__builtin_ia32_pswapdsi: {			case X86::BI__builtin_ia32_pswapdsi: {
	llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext());			llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext());
	Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast");			Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast");
	llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_3dnowa_pswapd);			llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_3dnowa_pswapd);
	return Builder.CreateCall(F, Ops, "pswapd");			return Builder.CreateCall(F, Ops, "pswapd");
	}			}
	▲ Show 20 Lines • Show All 3,323 Lines • Show Last 20 Lines

clang/lib/Headers/avx512fintrin.h

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===			/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
	*			*
	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	* See https://llvm.org/LICENSE.txt for license information.			* See https://llvm.org/LICENSE.txt for license information.
	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	*			*
	*===-----------------------------------------------------------------------===			*===-----------------------------------------------------------------------===
	*/			*/
	#ifndef __IMMINTRIN_H			#ifndef __IMMINTRIN_H
	#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."			#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
				Lint: Pre-merge checks Inline Actions clang-tidy: error: "Never use <avx512fintrin.h> directly; include <immintrin.h> instead." [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."…
	#endif			#endif

	#ifndef __AVX512FINTRIN_H			#ifndef __AVX512FINTRIN_H
	#define __AVX512FINTRIN_H			#define __AVX512FINTRIN_H

	typedef char __v64qi __attribute__((__vector_size__(64)));			typedef char __v64qi __attribute__((__vector_size__(64)));
	typedef short __v32hi __attribute__((__vector_size__(64)));			typedef short __v32hi __attribute__((__vector_size__(64)));
	typedef double __v8df __attribute__((__vector_size__(64)));			typedef double __v8df __attribute__((__vector_size__(64)));
	▲ Show 20 Lines • Show All 178 Lines • ▼ Show 20 Lines

	static __inline__ __m512i __DEFAULT_FN_ATTRS512			static __inline__ __m512i __DEFAULT_FN_ATTRS512
	_mm512_undefined_epi32(void)			_mm512_undefined_epi32(void)
	{			{
	return (__m512i)__builtin_ia32_undef512();			return (__m512i)__builtin_ia32_undef512();
	}			}

	static __inline__ __m512i __DEFAULT_FN_ATTRS512			static __inline__ __m512i __DEFAULT_FN_ATTRS512
	_mm512_broadcastd_epi32 (__m128i __A)			_mm512_broadcastd_epi32 (__m128i __A)
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name 'm128i'; did you mean 'm512i'? [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__m128i'; did you mean '__m512i'? [clang-diagnostic…
	{			{
	return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,			return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
				Lint: Pre-merge checks Inline Actions clang-tidy: error: use of undeclared identifier '__v4si' [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: use of undeclared identifier '__v4si' [clang-diagnostic-error] [[https…
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
	}			}

	static __inline__ __m512i __DEFAULT_FN_ATTRS512			static __inline__ __m512i __DEFAULT_FN_ATTRS512
	_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)			_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name 'm128i'; did you mean 'm512i'? [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__m128i'; did you mean '__m512i'? [clang-diagnostic…
	{			{
	return (__m512i)__builtin_ia32_selectd_512(__M,			return (__m512i)__builtin_ia32_selectd_512(__M,
	(__v16si) _mm512_broadcastd_epi32(__A),			(__v16si) _mm512_broadcastd_epi32(__A),
	(__v16si) __O);			(__v16si) __O);
	}			}

	static __inline__ __m512i __DEFAULT_FN_ATTRS512			static __inline__ __m512i __DEFAULT_FN_ATTRS512
	_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)			_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name 'm128i'; did you mean 'm512i'? [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__m128i'; did you mean '__m512i'? [clang-diagnostic…
	{			{
	return (__m512i)__builtin_ia32_selectd_512(__M,			return (__m512i)__builtin_ia32_selectd_512(__M,
	(__v16si) _mm512_broadcastd_epi32(__A),			(__v16si) _mm512_broadcastd_epi32(__A),
	(__v16si) _mm512_setzero_si512());			(__v16si) _mm512_setzero_si512());
	}			}

	static __inline__ __m512i __DEFAULT_FN_ATTRS512			static __inline__ __m512i __DEFAULT_FN_ATTRS512
	_mm512_broadcastq_epi64 (__m128i __A)			_mm512_broadcastq_epi64 (__m128i __A)
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name 'm128i'; did you mean 'm512i'? [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__m128i'; did you mean '__m512i'? [clang-diagnostic…
	{			{
	return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,			return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name 'v2di'; did you mean 'v8di'? [clang-diagnostic-error] not useful clang-tidy: error: unknown type name 'v2di'; did you mean 'v8di'? [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__v2di'; did you mean '__v8di'? [clang-diagnostic-error]…
	0, 0, 0, 0, 0, 0, 0, 0);			0, 0, 0, 0, 0, 0, 0, 0);
	}			}

	static __inline__ __m512i __DEFAULT_FN_ATTRS512			static __inline__ __m512i __DEFAULT_FN_ATTRS512
	_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)			_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name 'm128i'; did you mean 'm512i'? [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__m128i'; did you mean '__m512i'? [clang-diagnostic…
	{			{
	return (__m512i)__builtin_ia32_selectq_512(__M,			return (__m512i)__builtin_ia32_selectq_512(__M,
	(__v8di) _mm512_broadcastq_epi64(__A),			(__v8di) _mm512_broadcastq_epi64(__A),
	(__v8di) __O);			(__v8di) __O);

	}			}

	static __inline__ __m512i __DEFAULT_FN_ATTRS512			static __inline__ __m512i __DEFAULT_FN_ATTRS512
	_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)			_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name 'm128i'; did you mean 'm512i'? [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__m128i'; did you mean '__m512i'? [clang-diagnostic…
	{			{
	return (__m512i)__builtin_ia32_selectq_512(__M,			return (__m512i)__builtin_ia32_selectq_512(__M,
	(__v8di) _mm512_broadcastq_epi64(__A),			(__v8di) _mm512_broadcastq_epi64(__A),
	(__v8di) _mm512_setzero_si512());			(__v8di) _mm512_setzero_si512());
	}			}


	static __inline __m512 __DEFAULT_FN_ATTRS512			static __inline __m512 __DEFAULT_FN_ATTRS512
	▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines
	_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)			_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
	{			{
	return (__m512i)__builtin_ia32_selectq_512(__M,			return (__m512i)__builtin_ia32_selectq_512(__M,
	(__v8di)_mm512_set1_epi64(__A),			(__v8di)_mm512_set1_epi64(__A),
	(__v8di)_mm512_setzero_si512());			(__v8di)_mm512_setzero_si512());
	}			}

	static __inline__ __m512 __DEFAULT_FN_ATTRS512			static __inline__ __m512 __DEFAULT_FN_ATTRS512
	_mm512_broadcastss_ps(__m128 __A)			_mm512_broadcastss_ps(__m128 __A)
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name '__m128' [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__m128' [clang-diagnostic-error] [[https://github.
	{			{
	return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,			return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
				Lint: Pre-merge checks Inline Actions clang-tidy: error: use of undeclared identifier '__v4sf' [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: use of undeclared identifier '__v4sf' [clang-diagnostic-error] [[https…
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
	}			}

	static __inline __m512i __DEFAULT_FN_ATTRS512			static __inline __m512i __DEFAULT_FN_ATTRS512
	_mm512_set4_epi32 (int __A, int __B, int __C, int __D)			_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
	{			{
	return __extension__ (__m512i)(__v16si)			return __extension__ (__m512i)(__v16si)
	{ __D, __C, __B, __A, __D, __C, __B, __A,			{ __D, __C, __B, __A, __D, __C, __B, __A,
	Show All 31 Lines

	#define _mm512_setr4_pd(e0,e1,e2,e3) \			#define _mm512_setr4_pd(e0,e1,e2,e3) \
	_mm512_set4_pd((e3),(e2),(e1),(e0))			_mm512_set4_pd((e3),(e2),(e1),(e0))

	#define _mm512_setr4_ps(e0,e1,e2,e3) \			#define _mm512_setr4_ps(e0,e1,e2,e3) \
	_mm512_set4_ps((e3),(e2),(e1),(e0))			_mm512_set4_ps((e3),(e2),(e1),(e0))

	static __inline__ __m512d __DEFAULT_FN_ATTRS512			static __inline__ __m512d __DEFAULT_FN_ATTRS512
	_mm512_broadcastsd_pd(__m128d __A)			_mm512_broadcastsd_pd(__m128d __A)
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name 'm128d'; did you mean 'm512d'? [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__m128d'; did you mean '__m512d'? [clang-diagnostic…
	{			{
	return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,			return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name 'v2df'; did you mean 'v8df'? [clang-diagnostic-error] not useful clang-tidy: error: unknown type name 'v2df'; did you mean 'v8df'? [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__v2df'; did you mean '__v8df'? [clang-diagnostic-error]…
	0, 0, 0, 0, 0, 0, 0, 0);			0, 0, 0, 0, 0, 0, 0, 0);
	}			}

	/* Cast between vector types */			/* Cast between vector types */

	static __inline __m512d __DEFAULT_FN_ATTRS512			static __inline __m512d __DEFAULT_FN_ATTRS512
	_mm512_castpd256_pd512(__m256d __a)			_mm512_castpd256_pd512(__m256d __a)
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name '__m256d' [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__m256d' [clang-diagnostic-error] [[https://github.
	{			{
	return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);			return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
	}			}

	static __inline __m512 __DEFAULT_FN_ATTRS512			static __inline __m512 __DEFAULT_FN_ATTRS512
	_mm512_castps256_ps512(__m256 __a)			_mm512_castps256_ps512(__m256 __a)
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name '__m256' [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__m256' [clang-diagnostic-error] [[https://github.
	{			{
	return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,			return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
	-1, -1, -1, -1, -1, -1, -1, -1);			-1, -1, -1, -1, -1, -1, -1, -1);
	}			}

	static __inline __m128d __DEFAULT_FN_ATTRS512			static __inline __m128d __DEFAULT_FN_ATTRS512
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name 'm128d'; did you mean 'm512d'? [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__m128d'; did you mean '__m512d'? [clang-diagnostic…
	_mm512_castpd512_pd128(__m512d __a)			_mm512_castpd512_pd128(__m512d __a)
	{			{
	return __builtin_shufflevector(__a, __a, 0, 1);			return __builtin_shufflevector(__a, __a, 0, 1);
				Lint: Pre-merge checks Inline Actions clang-tidy: error: cannot initialize return object of type 'm512d' (vector of 8 'double' values) with an rvalue of type 'attribute((vector_size__(2 * sizeof(double)))) double' (vector of 2 'double' values) [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: cannot initialize return object of type '__m512d' (vector of 8 'double'…
	}			}

	static __inline __m256d __DEFAULT_FN_ATTRS512			static __inline __m256d __DEFAULT_FN_ATTRS512
	_mm512_castpd512_pd256 (__m512d __A)			_mm512_castpd512_pd256 (__m512d __A)
	{			{
	return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);			return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
	}			}

	▲ Show 20 Lines • Show All 8,882 Lines • ▼ Show 20 Lines
	* computations. In vector-reduction arithmetic, the evaluation off is			* computations. In vector-reduction arithmetic, the evaluation off is
	* independent of the order of the input elements of V.			* independent of the order of the input elements of V.

	* Used bisection method. At each step, we partition the vector with previous			* Used bisection method. At each step, we partition the vector with previous
	* step in half, and the operation is performed on its two halves.			* step in half, and the operation is performed on its two halves.
	* This takes log2(n) steps where n is the number of elements in the vector.			* This takes log2(n) steps where n is the number of elements in the vector.
	*/			*/

	#define _mm512_mask_reduce_operator(op) \
	__v4du __t1 = (__v4du)_mm512_extracti64x4_epi64(__W, 0); \
	__v4du __t2 = (__v4du)_mm512_extracti64x4_epi64(__W, 1); \
	__m256i __t3 = (__m256i)(__t1 op __t2); \
	__v2du __t4 = (__v2du)_mm256_extracti128_si256(__t3, 0); \
	__v2du __t5 = (__v2du)_mm256_extracti128_si256(__t3, 1); \
	__v2du __t6 = __t4 op __t5; \
	__v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
	__v2du __t8 = __t6 op __t7; \
	return __t8[0]

	static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {			static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
	_mm512_mask_reduce_operator(+);			return __builtin_ia32_reduce_add_q512(__W);
	}			}

	static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {			static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
	_mm512_mask_reduce_operator(*);			return __builtin_ia32_reduce_mul_q512(__W);
	}			}

	static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {			static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
	_mm512_mask_reduce_operator(&);			return __builtin_ia32_reduce_and_q512(__W);
	}			}

	static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {			static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
	_mm512_mask_reduce_operator(\|);			return __builtin_ia32_reduce_or_q512(__W);
	}			}

	static __inline__ long long __DEFAULT_FN_ATTRS512			static __inline__ long long __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {			_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
	__W = _mm512_maskz_mov_epi64(__M, __W);			__W = _mm512_maskz_mov_epi64(__M, __W);
	_mm512_mask_reduce_operator(+);			return __builtin_ia32_reduce_add_q512(__W);
	}			}

	static __inline__ long long __DEFAULT_FN_ATTRS512			static __inline__ long long __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {			_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
	__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);			__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
	_mm512_mask_reduce_operator(*);			return __builtin_ia32_reduce_mul_q512(__W);
	}			}

	static __inline__ long long __DEFAULT_FN_ATTRS512			static __inline__ long long __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {			_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
	__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W);			__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W);
	_mm512_mask_reduce_operator(&);			return __builtin_ia32_reduce_and_q512(__W);
	}			}

	static __inline__ long long __DEFAULT_FN_ATTRS512			static __inline__ long long __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {			_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
	__W = _mm512_maskz_mov_epi64(__M, __W);			__W = _mm512_maskz_mov_epi64(__M, __W);
	_mm512_mask_reduce_operator(\|);			return __builtin_ia32_reduce_or_q512(__W);
	}			}
	#undef _mm512_mask_reduce_operator

	#define _mm512_mask_reduce_operator(op) \			#define _mm512_mask_reduce_operator(op) \
	__m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \			__m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \
	__m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \			__m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \
	__m256d __t3 = __t1 op __t2; \			__m256d __t3 = __t1 op __t2; \
	__m128d __t4 = _mm256_extractf128_pd(__t3, 0); \			__m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
	__m128d __t5 = _mm256_extractf128_pd(__t3, 1); \			__m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
	__m128d __t6 = __t4 op __t5; \			__m128d __t6 = __t4 op __t5; \
	Show All 17 Lines

	static __inline__ double __DEFAULT_FN_ATTRS512			static __inline__ double __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {			_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
	__W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);			__W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
	_mm512_mask_reduce_operator(*);			_mm512_mask_reduce_operator(*);
	}			}
	#undef _mm512_mask_reduce_operator			#undef _mm512_mask_reduce_operator

	#define _mm512_mask_reduce_operator(op) \
	__v8su __t1 = (__v8su)_mm512_extracti64x4_epi64(__W, 0); \
	__v8su __t2 = (__v8su)_mm512_extracti64x4_epi64(__W, 1); \
	__m256i __t3 = (__m256i)(__t1 op __t2); \
	__v4su __t4 = (__v4su)_mm256_extracti128_si256(__t3, 0); \
	__v4su __t5 = (__v4su)_mm256_extracti128_si256(__t3, 1); \
	__v4su __t6 = __t4 op __t5; \
	__v4su __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
	__v4su __t8 = __t6 op __t7; \
	__v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
	__v4su __t10 = __t8 op __t9; \
	return __t10[0]

	static __inline__ int __DEFAULT_FN_ATTRS512			static __inline__ int __DEFAULT_FN_ATTRS512
	_mm512_reduce_add_epi32(__m512i __W) {			_mm512_reduce_add_epi32(__m512i __W) {
	_mm512_mask_reduce_operator(+);			return __builtin_ia32_reduce_add_d512((__v16si)__W);
	}			}

	static __inline__ int __DEFAULT_FN_ATTRS512			static __inline__ int __DEFAULT_FN_ATTRS512
	_mm512_reduce_mul_epi32(__m512i __W) {			_mm512_reduce_mul_epi32(__m512i __W) {
	_mm512_mask_reduce_operator(*);			return __builtin_ia32_reduce_mul_d512((__v16si)__W);
	}			}

	static __inline__ int __DEFAULT_FN_ATTRS512			static __inline__ int __DEFAULT_FN_ATTRS512
	_mm512_reduce_and_epi32(__m512i __W) {			_mm512_reduce_and_epi32(__m512i __W) {
	_mm512_mask_reduce_operator(&);			return __builtin_ia32_reduce_and_d512((__v16si)__W);
	}			}

	static __inline__ int __DEFAULT_FN_ATTRS512			static __inline__ int __DEFAULT_FN_ATTRS512
	_mm512_reduce_or_epi32(__m512i __W) {			_mm512_reduce_or_epi32(__m512i __W) {
	_mm512_mask_reduce_operator(\|);			return __builtin_ia32_reduce_or_d512((__v16si)__W);
	}			}

	static __inline__ int __DEFAULT_FN_ATTRS512			static __inline__ int __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {			_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
	__W = _mm512_maskz_mov_epi32(__M, __W);			__W = _mm512_maskz_mov_epi32(__M, __W);
	_mm512_mask_reduce_operator(+);			return __builtin_ia32_reduce_add_d512((__v16si)__W);
	}			}

	static __inline__ int __DEFAULT_FN_ATTRS512			static __inline__ int __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {			_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
	__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);			__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
	_mm512_mask_reduce_operator(*);			return __builtin_ia32_reduce_mul_d512((__v16si)__W);
	}			}

	static __inline__ int __DEFAULT_FN_ATTRS512			static __inline__ int __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {			_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
	__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W);			__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W);
	_mm512_mask_reduce_operator(&);			return __builtin_ia32_reduce_and_d512((__v16si)__W);
	}			}

	static __inline__ int __DEFAULT_FN_ATTRS512			static __inline__ int __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {			_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
	__W = _mm512_maskz_mov_epi32(__M, __W);			__W = _mm512_maskz_mov_epi32(__M, __W);
	_mm512_mask_reduce_operator(\|);			return __builtin_ia32_reduce_or_d512((__v16si)__W);
	}			}
	#undef _mm512_mask_reduce_operator

	#define _mm512_mask_reduce_operator(op) \			#define _mm512_mask_reduce_operator(op) \
	__m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \			__m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \
	__m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \			__m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \
	__m256 __t3 = __t1 op __t2; \			__m256 __t3 = __t1 op __t2; \
	__m128 __t4 = _mm256_extractf128_ps(__t3, 0); \			__m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
	__m128 __t5 = _mm256_extractf128_ps(__t3, 1); \			__m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
	__m128 __t6 = __t4 op __t5; \			__m128 __t6 = __t4 op __t5; \
	Show All 21 Lines

	static __inline__ float __DEFAULT_FN_ATTRS512			static __inline__ float __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {			_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
	__W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);			__W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
	_mm512_mask_reduce_operator(*);			_mm512_mask_reduce_operator(*);
	}			}
	#undef _mm512_mask_reduce_operator			#undef _mm512_mask_reduce_operator

	#define _mm512_mask_reduce_operator(op) \
	__m512i __t1 = (__m512i)__builtin_shufflevector((__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3); \
	__m512i __t2 = _mm512_##op(__V, __t1); \
	__m512i __t3 = (__m512i)__builtin_shufflevector((__v8di)__t2, (__v8di)__t2, 2, 3, 0, 1, 6, 7, 4, 5); \
	__m512i __t4 = _mm512_##op(__t2, __t3); \
	__m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \
	__v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \
	return __t6[0]

	static __inline__ long long __DEFAULT_FN_ATTRS512			static __inline__ long long __DEFAULT_FN_ATTRS512
	_mm512_reduce_max_epi64(__m512i __V) {			_mm512_reduce_max_epi64(__m512i __V) {
	_mm512_mask_reduce_operator(max_epi64);			return __builtin_ia32_reduce_smax_q512(__V);
	}			}

	static __inline__ unsigned long long __DEFAULT_FN_ATTRS512			static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
	_mm512_reduce_max_epu64(__m512i __V) {			_mm512_reduce_max_epu64(__m512i __V) {
	_mm512_mask_reduce_operator(max_epu64);			return __builtin_ia32_reduce_umax_q512(__V);
	}			}

	static __inline__ long long __DEFAULT_FN_ATTRS512			static __inline__ long long __DEFAULT_FN_ATTRS512
	_mm512_reduce_min_epi64(__m512i __V) {			_mm512_reduce_min_epi64(__m512i __V) {
	_mm512_mask_reduce_operator(min_epi64);			return __builtin_ia32_reduce_smin_q512(__V);
	}			}

	static __inline__ unsigned long long __DEFAULT_FN_ATTRS512			static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
	_mm512_reduce_min_epu64(__m512i __V) {			_mm512_reduce_min_epu64(__m512i __V) {
	_mm512_mask_reduce_operator(min_epu64);			return __builtin_ia32_reduce_umin_q512(__V);
	}			}

	static __inline__ long long __DEFAULT_FN_ATTRS512			static __inline__ long long __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {			_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
	__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);			__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
	_mm512_mask_reduce_operator(max_epi64);			return __builtin_ia32_reduce_smax_q512(__V);
	}			}

	static __inline__ unsigned long long __DEFAULT_FN_ATTRS512			static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {			_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
	__V = _mm512_maskz_mov_epi64(__M, __V);			__V = _mm512_maskz_mov_epi64(__M, __V);
	_mm512_mask_reduce_operator(max_epu64);			return __builtin_ia32_reduce_umax_q512(__V);
	}			}

	static __inline__ long long __DEFAULT_FN_ATTRS512			static __inline__ long long __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {			_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
	__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);			__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
	_mm512_mask_reduce_operator(min_epi64);			return __builtin_ia32_reduce_smin_q512(__V);
	}			}

	static __inline__ unsigned long long __DEFAULT_FN_ATTRS512			static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {			_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
	__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V);			__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V);
	_mm512_mask_reduce_operator(min_epu64);			return __builtin_ia32_reduce_umin_q512(__V);
	}			}
	#undef _mm512_mask_reduce_operator

	#define _mm512_mask_reduce_operator(op) \
	__m256i __t1 = _mm512_extracti64x4_epi64(__V, 0); \
	__m256i __t2 = _mm512_extracti64x4_epi64(__V, 1); \
	__m256i __t3 = _mm256_##op(__t1, __t2); \
	__m128i __t4 = _mm256_extracti128_si256(__t3, 0); \
	__m128i __t5 = _mm256_extracti128_si256(__t3, 1); \
	__m128i __t6 = _mm_##op(__t4, __t5); \
	__m128i __t7 = (__m128i)__builtin_shufflevector((__v4si)__t6, (__v4si)__t6, 2, 3, 0, 1); \
	__m128i __t8 = _mm_##op(__t6, __t7); \
	__m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \
	__v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \
	return __t10[0]

	static __inline__ int __DEFAULT_FN_ATTRS512			static __inline__ int __DEFAULT_FN_ATTRS512
	_mm512_reduce_max_epi32(__m512i __V) {			_mm512_reduce_max_epi32(__m512i __V) {
	_mm512_mask_reduce_operator(max_epi32);			return __builtin_ia32_reduce_smax_d512((__v16si)__V);
	}			}

	static __inline__ unsigned int __DEFAULT_FN_ATTRS512			static __inline__ unsigned int __DEFAULT_FN_ATTRS512
	_mm512_reduce_max_epu32(__m512i __V) {			_mm512_reduce_max_epu32(__m512i __V) {
	_mm512_mask_reduce_operator(max_epu32);			return __builtin_ia32_reduce_umax_d512((__v16si)__V);
	}			}

	static __inline__ int __DEFAULT_FN_ATTRS512			static __inline__ int __DEFAULT_FN_ATTRS512
	_mm512_reduce_min_epi32(__m512i __V) {			_mm512_reduce_min_epi32(__m512i __V) {
	_mm512_mask_reduce_operator(min_epi32);			return __builtin_ia32_reduce_smin_d512((__v16si)__V);
	}			}

	static __inline__ unsigned int __DEFAULT_FN_ATTRS512			static __inline__ unsigned int __DEFAULT_FN_ATTRS512
	_mm512_reduce_min_epu32(__m512i __V) {			_mm512_reduce_min_epu32(__m512i __V) {
	_mm512_mask_reduce_operator(min_epu32);			return __builtin_ia32_reduce_umin_d512((__v16si)__V);
	}			}

	static __inline__ int __DEFAULT_FN_ATTRS512			static __inline__ int __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {			_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
	__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);			__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
	_mm512_mask_reduce_operator(max_epi32);			return __builtin_ia32_reduce_smax_d512((__v16si)__V);
	}			}

	static __inline__ unsigned int __DEFAULT_FN_ATTRS512			static __inline__ unsigned int __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {			_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
	__V = _mm512_maskz_mov_epi32(__M, __V);			__V = _mm512_maskz_mov_epi32(__M, __V);
	_mm512_mask_reduce_operator(max_epu32);			return __builtin_ia32_reduce_umax_d512((__v16si)__V);
	}			}

	static __inline__ int __DEFAULT_FN_ATTRS512			static __inline__ int __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {			_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
	__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);			__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
	_mm512_mask_reduce_operator(min_epi32);			return __builtin_ia32_reduce_smin_d512((__v16si)__V);
	}			}

	static __inline__ unsigned int __DEFAULT_FN_ATTRS512			static __inline__ unsigned int __DEFAULT_FN_ATTRS512
	_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {			_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
	__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V);			__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V);
	_mm512_mask_reduce_operator(min_epu32);			return __builtin_ia32_reduce_umin_d512((__v16si)__V);
	}			}
	#undef _mm512_mask_reduce_operator

	#define _mm512_mask_reduce_operator(op) \			#define _mm512_mask_reduce_operator(op) \
	__m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \			__m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \
	__m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \			__m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \
	__m256d __t3 = _mm256_##op(__t1, __t2); \			__m256d __t3 = _mm256_##op(__t1, __t2); \
	__m128d __t4 = _mm256_extractf128_pd(__t3, 0); \			__m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
	__m128d __t5 = _mm256_extractf128_pd(__t3, 1); \			__m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
	__m128d __t6 = _mm_##op(__t4, __t5); \			__m128d __t6 = _mm_##op(__t4, __t5); \
	▲ Show 20 Lines • Show All 85 Lines • Show Last 20 Lines

clang/test/CodeGen/X86/avx512-reduceIntrin.c

	// RUN: %clang_cc1 -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s			// RUN: %clang_cc1 -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s

	#include <immintrin.h>			#include <immintrin.h>

	long long test_mm512_reduce_add_epi64(__m512i __W){			long long test_mm512_reduce_add_epi64(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_add_epi64(			// CHECK-LABEL: @test_mm512_reduce_add_epi64(
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: add <4 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: add <2 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <2 x i64> %{{.}}, <2 x i64> %{{.}}, <2 x i32> <i32 1, i32 0>
	// CHECK: add <2 x i64> %{{.}}, %{{.}}
	// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
	return _mm512_reduce_add_epi64(__W);			return _mm512_reduce_add_epi64(__W);
	}			}

	long long test_mm512_reduce_mul_epi64(__m512i __W){			long long test_mm512_reduce_mul_epi64(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_mul_epi64(			// CHECK-LABEL: @test_mm512_reduce_mul_epi64(
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: mul <4 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: mul <2 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <2 x i64> %{{.}}, <2 x i64> %{{.}}, <2 x i32> <i32 1, i32 0>
	// CHECK: mul <2 x i64> %{{.}}, %{{.}}
	// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
	return _mm512_reduce_mul_epi64(__W);			return _mm512_reduce_mul_epi64(__W);
	}			}

	long long test_mm512_reduce_or_epi64(__m512i __W){			long long test_mm512_reduce_or_epi64(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_or_epi64(			// CHECK-LABEL: @test_mm512_reduce_or_epi64(
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: or <4 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: or <2 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <2 x i64> %{{.}}, <2 x i64> %{{.}}, <2 x i32> <i32 1, i32 0>
	// CHECK: or <2 x i64> %{{.}}, %{{.}}
	// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
	return _mm512_reduce_or_epi64(__W);			return _mm512_reduce_or_epi64(__W);
	}			}

	long long test_mm512_reduce_and_epi64(__m512i __W){			long long test_mm512_reduce_and_epi64(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_and_epi64(			// CHECK-LABEL: @test_mm512_reduce_and_epi64(
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: and <4 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: and <2 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <2 x i64> %{{.}}, <2 x i64> %{{.}}, <2 x i32> <i32 1, i32 0>
	// CHECK: and <2 x i64> %{{.}}, %{{.}}
	// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
	return _mm512_reduce_and_epi64(__W);			return _mm512_reduce_and_epi64(__W);
	}			}

	long long test_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W){			long long test_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_add_epi64(			// CHECK-LABEL: @test_mm512_mask_reduce_add_epi64(
	// CHECK: bitcast i8 %{{.*}} to <8 x i1>			// CHECK: bitcast i8 %{{.*}} to <8 x i1>
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}			// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: add <4 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: add <2 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <2 x i64> %{{.}}, <2 x i64> %{{.}}, <2 x i32> <i32 1, i32 0>
	// CHECK: add <2 x i64> %{{.}}, %{{.}}
	// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
	return _mm512_mask_reduce_add_epi64(__M, __W);			return _mm512_mask_reduce_add_epi64(__M, __W);
	}			}

	long long test_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W){			long long test_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_mul_epi64(			// CHECK-LABEL: @test_mm512_mask_reduce_mul_epi64(
	// CHECK: bitcast i8 %{{.*}} to <8 x i1>			// CHECK: bitcast i8 %{{.*}} to <8 x i1>
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}			// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: mul <4 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: mul <2 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <2 x i64> %{{.}}, <2 x i64> %{{.}}, <2 x i32> <i32 1, i32 0>
	// CHECK: mul <2 x i64> %{{.}}, %{{.}}
	// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
	return _mm512_mask_reduce_mul_epi64(__M, __W);			return _mm512_mask_reduce_mul_epi64(__M, __W);
	}			}

	long long test_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W){			long long test_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_and_epi64(			// CHECK-LABEL: @test_mm512_mask_reduce_and_epi64(
	// CHECK: bitcast i8 %{{.*}} to <8 x i1>			// CHECK: bitcast i8 %{{.*}} to <8 x i1>
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}			// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: and <4 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: and <2 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <2 x i64> %{{.}}, <2 x i64> %{{.}}, <2 x i32> <i32 1, i32 0>
	// CHECK: and <2 x i64> %{{.}}, %{{.}}
	// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
	return _mm512_mask_reduce_and_epi64(__M, __W);			return _mm512_mask_reduce_and_epi64(__M, __W);
	}			}

	long long test_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W){			long long test_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_or_epi64(			// CHECK-LABEL: @test_mm512_mask_reduce_or_epi64(
	// CHECK: bitcast i8 %{{.*}} to <8 x i1>			// CHECK: bitcast i8 %{{.*}} to <8 x i1>
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}			// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: or <4 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: or <2 x i64> %{{.}}, %{{.}}
	// CHECK: shufflevector <2 x i64> %{{.}}, <2 x i64> %{{.}}, <2 x i32> <i32 1, i32 0>
	// CHECK: or <2 x i64> %{{.}}, %{{.}}
	// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
	return _mm512_mask_reduce_or_epi64(__M, __W);			return _mm512_mask_reduce_or_epi64(__M, __W);
	}			}

	int test_mm512_reduce_add_epi32(__m512i __W){			int test_mm512_reduce_add_epi32(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_add_epi32(			// CHECK-LABEL: @test_mm512_reduce_add_epi32(
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: add <8 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: add <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: add <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: add <4 x i32> %{{.}}, %{{.}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_reduce_add_epi32(__W);			return _mm512_reduce_add_epi32(__W);
	}			}

	int test_mm512_reduce_mul_epi32(__m512i __W){			int test_mm512_reduce_mul_epi32(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_mul_epi32(			// CHECK-LABEL: @test_mm512_reduce_mul_epi32(
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: mul <8 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: mul <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: mul <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: mul <4 x i32> %{{.}}, %{{.}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_reduce_mul_epi32(__W);			return _mm512_reduce_mul_epi32(__W);
	}			}

	int test_mm512_reduce_or_epi32(__m512i __W){			int test_mm512_reduce_or_epi32(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_or_epi32(			// CHECK: call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: or <8 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: or <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: or <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: or <4 x i32> %{{.}}, %{{.}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_reduce_or_epi32(__W);			return _mm512_reduce_or_epi32(__W);
	}			}

	int test_mm512_reduce_and_epi32(__m512i __W){			int test_mm512_reduce_and_epi32(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_and_epi32(			// CHECK-LABEL: @test_mm512_reduce_and_epi32(
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: and <8 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: and <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: and <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: and <4 x i32> %{{.}}, %{{.}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_reduce_and_epi32(__W);			return _mm512_reduce_and_epi32(__W);
	}			}

	int test_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W){			int test_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_add_epi32(			// CHECK-LABEL: @test_mm512_mask_reduce_add_epi32(
	// CHECK: bitcast i16 %{{.*}} to <16 x i1>			// CHECK: bitcast i16 %{{.*}} to <16 x i1>
	// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}			// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
	// CHECK: bitcast <16 x i32> %{{.*}} to <8 x i64>			// CHECK: call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: add <8 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: add <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: add <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: add <4 x i32> %{{.}}, %{{.}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_mask_reduce_add_epi32(__M, __W);			return _mm512_mask_reduce_add_epi32(__M, __W);
	}			}

	int test_mm512_mask_reduce_mul_epi32(__mmask16 __M, __m512i __W){			int test_mm512_mask_reduce_mul_epi32(__mmask16 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_mul_epi32(			// CHECK-LABEL: @test_mm512_mask_reduce_mul_epi32(
	// CHECK: bitcast i16 %{{.*}} to <16 x i1>			// CHECK: bitcast i16 %{{.*}} to <16 x i1>
	// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}			// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
	// CHECK: bitcast <16 x i32> %{{.*}} to <8 x i64>			// CHECK: call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: mul <8 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: mul <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: mul <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: mul <4 x i32> %{{.}}, %{{.}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_mask_reduce_mul_epi32(__M, __W);			return _mm512_mask_reduce_mul_epi32(__M, __W);
	}			}

	int test_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W){			int test_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_and_epi32(			// CHECK-LABEL: @test_mm512_mask_reduce_and_epi32(
	// CHECK: bitcast i16 %{{.*}} to <16 x i1>			// CHECK: bitcast i16 %{{.*}} to <16 x i1>
	// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}			// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
	// CHECK: bitcast <16 x i32> %{{.*}} to <8 x i64>			// CHECK: call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: and <8 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: and <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: and <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: and <4 x i32> %{{.}}, %{{.}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_mask_reduce_and_epi32(__M, __W);			return _mm512_mask_reduce_and_epi32(__M, __W);
	}			}

	int test_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W){			int test_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_or_epi32(			// CHECK-LABEL: @test_mm512_mask_reduce_or_epi32(
	// CHECK: bitcast i16 %{{.*}} to <16 x i1>			// CHECK: bitcast i16 %{{.*}} to <16 x i1>
	// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}			// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
	// CHECK: bitcast <16 x i32> %{{.*}} to <8 x i64>			// CHECK: call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: or <8 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: or <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: or <4 x i32> %{{.}}, %{{.}}
	// CHECK: shufflevector <4 x i32> %{{.}}, <4 x i32> %{{.}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: or <4 x i32> %{{.}}, %{{.}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_mask_reduce_or_epi32(__M, __W);			return _mm512_mask_reduce_or_epi32(__M, __W);
	}			}

	double test_mm512_reduce_add_pd(__m512d __W){			double test_mm512_reduce_add_pd(__m512d __W){
	// CHECK-LABEL: @test_mm512_reduce_add_pd(			// CHECK-LABEL: @test_mm512_reduce_add_pd(
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: fadd <4 x double> %{{.*}}, %{{.*}}			// CHECK: fadd <4 x double> %{{.*}}, %{{.*}}
	▲ Show 20 Lines • Show All 135 Lines • Show Last 20 Lines

clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c

	// RUN: %clang_cc1 -fexperimental-new-pass-manager -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s			// RUN: %clang_cc1 -fexperimental-new-pass-manager -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s

	#include <immintrin.h>			#include <immintrin.h>

	long long test_mm512_reduce_max_epi64(__m512i __W){			long long test_mm512_reduce_max_epi64(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_max_epi64(			// CHECK-LABEL: @test_mm512_reduce_max_epi64(
	// CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> %{{.*}})
	// CHECK: icmp sgt <8 x i64> %{{.}}, %{{.}}
	// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
	// CHECK: icmp sgt <8 x i64> %{{.}}, %{{.}}
	// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
	// CHECK: icmp sgt <8 x i64> %{{.}}, %{{.}}
	// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}
	return _mm512_reduce_max_epi64(__W);			return _mm512_reduce_max_epi64(__W);
	}			}

	unsigned long long test_mm512_reduce_max_epu64(__m512i __W){			unsigned long long test_mm512_reduce_max_epu64(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_max_epu64(			// CHECK-LABEL: @test_mm512_reduce_max_epu64(
	// CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> %{{.*}})
	// CHECK: icmp ugt <8 x i64> %{{.}}, %{{.}}
	// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
	// CHECK: icmp ugt <8 x i64> %{{.}}, %{{.}}
	// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
	// CHECK: icmp ugt <8 x i64> %{{.}}, %{{.}}
	// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}
	// CHECK: extractelement <8 x i64> %{{.*}}, i32 0
	return _mm512_reduce_max_epu64(__W);			return _mm512_reduce_max_epu64(__W);
	}			}

	double test_mm512_reduce_max_pd(__m512d __W){			double test_mm512_reduce_max_pd(__m512d __W){
	// CHECK-LABEL: @test_mm512_reduce_max_pd(			// CHECK-LABEL: @test_mm512_reduce_max_pd(
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %{{.}}, <4 x double> %{{.}})			// CHECK: call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %{{.}}, <4 x double> %{{.}})
	// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>			// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>			// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.}}, <2 x double> %{{.}})			// CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.}}, <2 x double> %{{.}})
	// CHECK: shufflevector <2 x double> %{{.}}, <2 x double> %{{.}}, <2 x i32> <i32 1, i32 0>			// CHECK: shufflevector <2 x double> %{{.}}, <2 x double> %{{.}}, <2 x i32> <i32 1, i32 0>
	// CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.}}, <2 x double> %{{.}})			// CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.}}, <2 x double> %{{.}})
	// CHECK: extractelement <2 x double> %{{.*}}, i32 0			// CHECK: extractelement <2 x double> %{{.*}}, i32 0
	return _mm512_reduce_max_pd(__W);			return _mm512_reduce_max_pd(__W);
	}			}

	long long test_mm512_reduce_min_epi64(__m512i __W){			long long test_mm512_reduce_min_epi64(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_min_epi64(			// CHECK-LABEL: @test_mm512_reduce_min_epi64(
	// CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> %{{.*}})
	// CHECK: icmp slt <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
	// CHECK: icmp slt <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
	// CHECK: icmp slt <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	return _mm512_reduce_min_epi64(__W);			return _mm512_reduce_min_epi64(__W);
	}			}

	unsigned long long test_mm512_reduce_min_epu64(__m512i __W){			unsigned long long test_mm512_reduce_min_epu64(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_min_epu64(			// CHECK-LABEL: @test_mm512_reduce_min_epu64(
	// CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> %{{.*}})
	// CHECK: icmp ult <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
	// CHECK: icmp ult <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
	// CHECK: icmp ult <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: extractelement <8 x i64> %{{.*}}, i32 0
	return _mm512_reduce_min_epu64(__W);			return _mm512_reduce_min_epu64(__W);
	}			}

	double test_mm512_reduce_min_pd(__m512d __W){			double test_mm512_reduce_min_pd(__m512d __W){
	// CHECK-LABEL: @test_mm512_reduce_min_pd(			// CHECK-LABEL: @test_mm512_reduce_min_pd(
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.}}, <4 x double> %{{.}})			// CHECK: call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.}}, <4 x double> %{{.}})
	// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>			// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>			// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.}}, <2 x double> %{{.}})			// CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.}}, <2 x double> %{{.}})
	// CHECK: shufflevector <2 x double> %{{.}}, <2 x double> %{{.}}, <2 x i32> <i32 1, i32 0>			// CHECK: shufflevector <2 x double> %{{.}}, <2 x double> %{{.}}, <2 x i32> <i32 1, i32 0>
	// CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.}}, <2 x double> %{{.}})			// CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.}}, <2 x double> %{{.}})
	// CHECK: extractelement <2 x double> %{{.*}}, i32 0			// CHECK: extractelement <2 x double> %{{.*}}, i32 0
	return _mm512_reduce_min_pd(__W);			return _mm512_reduce_min_pd(__W);
	}			}

	long long test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){			long long test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_max_epi64(			// CHECK-LABEL: @test_mm512_mask_reduce_max_epi64(
	// CHECK: bitcast i8 %{{.*}} to <8 x i1>			// CHECK: bitcast i8 %{{.*}} to <8 x i1>
	// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}			// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> %{{.*}})
	// CHECK: icmp sgt <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
	// CHECK: icmp sgt <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
	// CHECK: icmp sgt <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	return _mm512_mask_reduce_max_epi64(__M, __W);			return _mm512_mask_reduce_max_epi64(__M, __W);
	}			}

	unsigned long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){			unsigned long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_max_epu64(			// CHECK-LABEL: @test_mm512_mask_reduce_max_epu64(
	// CHECK: bitcast i8 %{{.*}} to <8 x i1>			// CHECK: bitcast i8 %{{.*}} to <8 x i1>
	// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}			// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> %{{.*}})
	// CHECK: icmp ugt <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
	// CHECK: icmp ugt <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
	// CHECK: icmp ugt <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	return _mm512_mask_reduce_max_epu64(__M, __W);			return _mm512_mask_reduce_max_epu64(__M, __W);
	}			}

	double test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){			double test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_max_pd(			// CHECK-LABEL: @test_mm512_mask_reduce_max_pd(
	// CHECK: bitcast i8 %{{.*}} to <8 x i1>			// CHECK: bitcast i8 %{{.*}} to <8 x i1>
	// CHECK: select <8 x i1> %{{.}}, <8 x double> %{{.}}, <8 x double> %{{.*}}			// CHECK: select <8 x i1> %{{.}}, <8 x double> %{{.}}, <8 x double> %{{.*}}
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %{{.}}, <4 x double> %{{.}})			// CHECK: call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %{{.}}, <4 x double> %{{.}})
	// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>			// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>			// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.}}, <2 x double> %{{.}})			// CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.}}, <2 x double> %{{.}})
	// CHECK: shufflevector <2 x double> %{{.}}, <2 x double> %{{.}}, <2 x i32> <i32 1, i32 0>			// CHECK: shufflevector <2 x double> %{{.}}, <2 x double> %{{.}}, <2 x i32> <i32 1, i32 0>
	// CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.}}, <2 x double> %{{.}})			// CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.}}, <2 x double> %{{.}})
	// CHECK: extractelement <2 x double> %{{.*}}, i32 0			// CHECK: extractelement <2 x double> %{{.*}}, i32 0
	return _mm512_mask_reduce_max_pd(__M, __W);			return _mm512_mask_reduce_max_pd(__M, __W);
	}			}

	long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){			long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_min_epi64(			// CHECK-LABEL: @test_mm512_mask_reduce_min_epi64(
	// CHECK: bitcast i8 %{{.*}} to <8 x i1>			// CHECK: bitcast i8 %{{.*}} to <8 x i1>
	// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}			// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> %{{.*}})
	// CHECK: icmp slt <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
	// CHECK: icmp slt <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
	// CHECK: icmp slt <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	return _mm512_mask_reduce_min_epi64(__M, __W);			return _mm512_mask_reduce_min_epi64(__M, __W);
	}			}

	unsigned long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){			unsigned long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_min_epu64(			// CHECK-LABEL: @test_mm512_mask_reduce_min_epu64(
	// CHECK: bitcast i8 %{{.*}} to <8 x i1>			// CHECK: bitcast i8 %{{.*}} to <8 x i1>
	// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}			// CHECK: select <8 x i1> %{{.}}, <8 x i64> %{{.}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>			// CHECK: call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> %{{.*}})
	// CHECK: icmp ult <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
	// CHECK: icmp ult <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
	// CHECK: icmp ult <8 x i64> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
	return _mm512_mask_reduce_min_epu64(__M, __W);			return _mm512_mask_reduce_min_epu64(__M, __W);
	}			}

	double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){			double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_min_pd(			// CHECK-LABEL: @test_mm512_mask_reduce_min_pd(
	// CHECK: bitcast i8 %{{.*}} to <8 x i1>			// CHECK: bitcast i8 %{{.*}} to <8 x i1>
	// CHECK: select <8 x i1> %{{.}}, <8 x double> %{{.}}, <8 x double> %{{.*}}			// CHECK: select <8 x i1> %{{.}}, <8 x double> %{{.}}, <8 x double> %{{.*}}
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.}}, <4 x double> %{{.}})			// CHECK: call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.}}, <4 x double> %{{.}})
	// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>			// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>			// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.}}, <2 x double> %{{.}})			// CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.}}, <2 x double> %{{.}})
	// CHECK: shufflevector <2 x double> %{{.}}, <2 x double> %{{.}}, <2 x i32> <i32 1, i32 0>			// CHECK: shufflevector <2 x double> %{{.}}, <2 x double> %{{.}}, <2 x i32> <i32 1, i32 0>
	// CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.}}, <2 x double> %{{.}})			// CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.}}, <2 x double> %{{.}})
	// CHECK: extractelement <2 x double> %{{.*}}, i32 0			// CHECK: extractelement <2 x double> %{{.*}}, i32 0
	return _mm512_mask_reduce_min_pd(__M, __W);			return _mm512_mask_reduce_min_pd(__M, __W);
	}			}

	int test_mm512_reduce_max_epi32(__m512i __W){			int test_mm512_reduce_max_epi32(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_max_epi32(			// CHECK-LABEL: @test_mm512_reduce_max_epi32(
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: icmp sgt <8 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: icmp sgt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: icmp sgt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: icmp sgt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_reduce_max_epi32(__W);			return _mm512_reduce_max_epi32(__W);
	}			}

	unsigned int test_mm512_reduce_max_epu32(__m512i __W){			unsigned int test_mm512_reduce_max_epu32(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_max_epu32(			// CHECK-LABEL: @test_mm512_reduce_max_epu32(
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: icmp ugt <8 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: icmp ugt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: icmp ugt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: icmp ugt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_reduce_max_epu32(__W);			return _mm512_reduce_max_epu32(__W);
	}			}

	float test_mm512_reduce_max_ps(__m512 __W){			float test_mm512_reduce_max_ps(__m512 __W){
	// CHECK-LABEL: define float @test_mm512_reduce_max_ps(			// CHECK-LABEL: define float @test_mm512_reduce_max_ps(
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %{{.}}, <8 x float> %{{.}})			// CHECK: call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %{{.}}, <8 x float> %{{.}})
	// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>			// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.}}, <4 x float> %{{.}})			// CHECK: call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.}}, <4 x float> %{{.}})
	// CHECK: shufflevector <4 x float> %{{.}}, <4 x float> %{{.}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>			// CHECK: shufflevector <4 x float> %{{.}}, <4 x float> %{{.}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.}}, <4 x float> %{{.}})			// CHECK: call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.}}, <4 x float> %{{.}})
	// CHECK: shufflevector <4 x float> %{{.}}, <4 x float> %{{.}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>			// CHECK: shufflevector <4 x float> %{{.}}, <4 x float> %{{.}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.}}, <4 x float> %{{.}})			// CHECK: call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.}}, <4 x float> %{{.}})
	// CHECK: extractelement <4 x float> %{{.*}}, i32 0			// CHECK: extractelement <4 x float> %{{.*}}, i32 0
	return _mm512_reduce_max_ps(__W);			return _mm512_reduce_max_ps(__W);
	}			}

	int test_mm512_reduce_min_epi32(__m512i __W){			int test_mm512_reduce_min_epi32(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_min_epi32(			// CHECK-LABEL: @test_mm512_reduce_min_epi32(
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: icmp slt <8 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_reduce_min_epi32(__W);			return _mm512_reduce_min_epi32(__W);
	}			}

	unsigned int test_mm512_reduce_min_epu32(__m512i __W){			unsigned int test_mm512_reduce_min_epu32(__m512i __W){
	// CHECK-LABEL: @test_mm512_reduce_min_epu32(			// CHECK-LABEL: @test_mm512_reduce_min_epu32(
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: icmp ult <8 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: icmp ult <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: icmp ult <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: icmp ult <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_reduce_min_epu32(__W);			return _mm512_reduce_min_epu32(__W);
	}			}

	float test_mm512_reduce_min_ps(__m512 __W){			float test_mm512_reduce_min_ps(__m512 __W){
	// CHECK-LABEL: define float @test_mm512_reduce_min_ps(			// CHECK-LABEL: define float @test_mm512_reduce_min_ps(
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %{{.}}, <8 x float> %{{.}})			// CHECK: call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %{{.}}, <8 x float> %{{.}})
	// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>			// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.}}, <4 x float> %{{.}})			// CHECK: call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.}}, <4 x float> %{{.}})
	// CHECK: shufflevector <4 x float> %{{.}}, <4 x float> %{{.}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>			// CHECK: shufflevector <4 x float> %{{.}}, <4 x float> %{{.}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.}}, <4 x float> %{{.}})			// CHECK: call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.}}, <4 x float> %{{.}})
	// CHECK: shufflevector <4 x float> %{{.}}, <4 x float> %{{.}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>			// CHECK: shufflevector <4 x float> %{{.}}, <4 x float> %{{.}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.}}, <4 x float> %{{.}})			// CHECK: call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.}}, <4 x float> %{{.}})
	// CHECK: extractelement <4 x float> %{{.*}}, i32 0			// CHECK: extractelement <4 x float> %{{.*}}, i32 0
	return _mm512_reduce_min_ps(__W);			return _mm512_reduce_min_ps(__W);
	}			}

	int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){			int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_max_epi32(			// CHECK-LABEL: @test_mm512_mask_reduce_max_epi32(
	// CHECK: bitcast i16 %{{.*}} to <16 x i1>			// CHECK: bitcast i16 %{{.*}} to <16 x i1>
	// CHECK: select <16 x i1> %{{.}}, <16 x i32> %{{.}}, <16 x i32> %{{.*}}			// CHECK: select <16 x i1> %{{.}}, <16 x i32> %{{.}}, <16 x i32> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: icmp sgt <8 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: icmp sgt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: icmp sgt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: icmp sgt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_mask_reduce_max_epi32(__M, __W);			return _mm512_mask_reduce_max_epi32(__M, __W);
	}			}

	unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){			unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_max_epu32(			// CHECK-LABEL: @test_mm512_mask_reduce_max_epu32(
	// CHECK: bitcast i16 %{{.*}} to <16 x i1>			// CHECK: bitcast i16 %{{.*}} to <16 x i1>
	// CHECK: select <16 x i1> %{{.}}, <16 x i32> %{{.}}, <16 x i32> %{{.*}}			// CHECK: select <16 x i1> %{{.}}, <16 x i32> %{{.}}, <16 x i32> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: icmp ugt <8 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: icmp ugt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: icmp ugt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: icmp ugt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_mask_reduce_max_epu32(__M, __W);			return _mm512_mask_reduce_max_epu32(__M, __W);
	}			}

	float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){			float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){
	// CHECK-LABEL: define float @test_mm512_mask_reduce_max_ps(			// CHECK-LABEL: define float @test_mm512_mask_reduce_max_ps(
	// CHECK: bitcast i16 %{{.*}} to <16 x i1>			// CHECK: bitcast i16 %{{.*}} to <16 x i1>
	// CHECK: select <16 x i1> %{{.}}, <16 x float> %{{.}}, <16 x float> %{{.*}}			// CHECK: select <16 x i1> %{{.}}, <16 x float> %{{.}}, <16 x float> %{{.*}}
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	Show All 9 Lines
	// CHECK: extractelement <4 x float> %{{.*}}, i32 0			// CHECK: extractelement <4 x float> %{{.*}}, i32 0
	return _mm512_mask_reduce_max_ps(__M, __W);			return _mm512_mask_reduce_max_ps(__M, __W);
	}			}

	int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){			int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_min_epi32(			// CHECK-LABEL: @test_mm512_mask_reduce_min_epi32(
	// CHECK: bitcast i16 %{{.*}} to <16 x i1>			// CHECK: bitcast i16 %{{.*}} to <16 x i1>
	// CHECK: select <16 x i1> %{{.}}, <16 x i32> %{{.}}, <16 x i32> %{{.*}}			// CHECK: select <16 x i1> %{{.}}, <16 x i32> %{{.}}, <16 x i32> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: icmp slt <8 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_mask_reduce_min_epi32(__M, __W);			return _mm512_mask_reduce_min_epi32(__M, __W);
	}			}

	unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){			unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){
	// CHECK-LABEL: @test_mm512_mask_reduce_min_epu32(			// CHECK-LABEL: @test_mm512_mask_reduce_min_epu32(
	// CHECK: bitcast i16 %{{.*}} to <16 x i1>			// CHECK: bitcast i16 %{{.*}} to <16 x i1>
	// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}			// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %{{.*}})
	// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
	// CHECK: icmp ult <8 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
	// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
	// CHECK: icmp ult <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	// CHECK: icmp ult <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	// CHECK: icmp ult <4 x i32> %{{.*}}, %{{.*}}
	// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
	// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
	return _mm512_mask_reduce_min_epu32(__M, __W);			return _mm512_mask_reduce_min_epu32(__M, __W);
	}			}

	float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){			float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){
	// CHECK-LABEL: define float @test_mm512_mask_reduce_min_ps(			// CHECK-LABEL: define float @test_mm512_mask_reduce_min_ps(
	// CHECK: bitcast i16 %{{.*}} to <16 x i1>			// CHECK: bitcast i16 %{{.*}} to <16 x i1>
	// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}			// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
	// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>			// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
	Show All 13 Lines