Diff 81744

include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 578 Lines • ▼ Show 20 Lines	public:
/// at every reduction level.		/// at every reduction level.
///		///
/// Pairwise:		/// Pairwise:
/// (v0, v1, v2, v3)		/// (v0, v1, v2, v3)
/// ((v0+v1), (v2+v3), undef, undef)		/// ((v0+v1), (v2+v3), undef, undef)
/// Split:		/// Split:
/// (v0, v1, v2, v3)		/// (v0, v1, v2, v3)
/// ((v0+v2), (v1+v3), undef, undef)		/// ((v0+v2), (v1+v3), undef, undef)
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const;		int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
		bool IsPairwiseForm) const;
		int getMinMaxReductionCost(Type Ty, Type CondTy, bool IsPairwiseForm) const;

/// \returns The cost of Intrinsic instructions. Types analysis only.		/// \returns The cost of Intrinsic instructions. Types analysis only.
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,		int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Type *> Tys, FastMathFlags FMF) const;		ArrayRef<Type *> Tys, FastMathFlags FMF) const;

/// \returns The cost of Intrinsic instructions. Analyses the real arguments.		/// \returns The cost of Intrinsic instructions. Analyses the real arguments.
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,		int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Value *> Args, FastMathFlags FMF) const;		ArrayRef<Value *> Args, FastMathFlags FMF) const;
▲ Show 20 Lines • Show All 175 Lines • ▼ Show 20 Lines	public:
virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,		virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
Value *Ptr, bool VariableMask,		Value *Ptr, bool VariableMask,
unsigned Alignment) = 0;		unsigned Alignment) = 0;
virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,		virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,		unsigned Factor,
ArrayRef<unsigned> Indices,		ArrayRef<unsigned> Indices,
unsigned Alignment,		unsigned Alignment,
unsigned AddressSpace) = 0;		unsigned AddressSpace) = 0;
virtual int getReductionCost(unsigned Opcode, Type *Ty,		virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
		bool IsPairwiseForm) = 0;
		virtual int getMinMaxReductionCost(Type Ty, Type CondTy,
bool IsPairwiseForm) = 0;		bool IsPairwiseForm) = 0;
virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,		virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Type *> Tys,		ArrayRef<Type *> Tys,
FastMathFlags FMF) = 0;		FastMathFlags FMF) = 0;
virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,		virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Value *> Args,		ArrayRef<Value *> Args,
FastMathFlags FMF) = 0;		FastMathFlags FMF) = 0;
virtual int getCallInstrCost(Function F, Type RetTy,		virtual int getCallInstrCost(Function F, Type RetTy,
ArrayRef<Type *> Tys) = 0;		ArrayRef<Type *> Tys) = 0;
▲ Show 20 Lines • Show All 222 Lines • ▼ Show 20 Lines	return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
Alignment);		Alignment);
}		}
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,		int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,		ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace) override {		unsigned AddressSpace) override {
return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,		return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);		Alignment, AddressSpace);
}		}
int getReductionCost(unsigned Opcode, Type *Ty,		int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
		bool IsPairwiseForm) override {
		return Impl.getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm);
		}
		int getMinMaxReductionCost(Type Ty, Type CondTy,
bool IsPairwiseForm) override {		bool IsPairwiseForm) override {
return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);		return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm);
}		}
int getIntrinsicInstrCost(Intrinsic::ID ID, Type RetTy, ArrayRef<Type > Tys,		int getIntrinsicInstrCost(Intrinsic::ID ID, Type RetTy, ArrayRef<Type > Tys,
FastMathFlags FMF) override {		FastMathFlags FMF) override {
return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);		return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
}		}
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,		int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Value *> Args,		ArrayRef<Value *> Args,
FastMathFlags FMF) override {		FastMathFlags FMF) override {
▲ Show 20 Lines • Show All 160 Lines • Show Last 20 Lines

include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 366 Lines • ▼ Show 20 Lines	public:
unsigned getCallInstrCost(Function F, Type RetTy, ArrayRef<Type *> Tys) {		unsigned getCallInstrCost(Function F, Type RetTy, ArrayRef<Type *> Tys) {
return 1;		return 1;
}		}

unsigned getNumberOfParts(Type *Tp) { return 0; }		unsigned getNumberOfParts(Type *Tp) { return 0; }

unsigned getAddressComputationCost(Type *Tp, bool) { return 0; }		unsigned getAddressComputationCost(Type *Tp, bool) { return 0; }

unsigned getReductionCost(unsigned, Type *, bool) { return 1; }		unsigned getArithmeticReductionCost(unsigned, Type *, bool) { return 1; }

		unsigned getMinMaxReductionCost(Type , Type , bool) { return 1; }

unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { return 0; }		unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { return 0; }

bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) {		bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) {
return false;		return false;
}		}

Value getOrCreateResultFromMemIntrinsic(IntrinsicInst Inst,		Value getOrCreateResultFromMemIntrinsic(IntrinsicInst Inst,
▲ Show 20 Lines • Show All 193 Lines • Show Last 20 Lines

include/llvm/CodeGen/BasicTTIImpl.h

Show First 20 Lines • Show All 919 Lines • ▼ Show 20 Lines	public:

unsigned getNumberOfParts(Type *Tp) {		unsigned getNumberOfParts(Type *Tp) {
std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);		std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
return LT.first;		return LT.first;
}		}

unsigned getAddressComputationCost(Type *Ty, bool IsComplex) { return 0; }		unsigned getAddressComputationCost(Type *Ty, bool IsComplex) { return 0; }

unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) {		unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty,
		bool IsPairwise) {
assert(Ty->isVectorTy() && "Expect a vector type");		assert(Ty->isVectorTy() && "Expect a vector type");
Type *ScalarTy = Ty->getVectorElementType();		Type *ScalarTy = Ty->getVectorElementType();
unsigned NumVecElts = Ty->getVectorNumElements();		unsigned NumVecElts = Ty->getVectorNumElements();
unsigned NumReduxLevels = Log2_32(NumVecElts);		unsigned NumReduxLevels = Log2_32(NumVecElts);
// Try to calculate arithmetic and shuffle op costs for reduction operations.		// Try to calculate arithmetic and shuffle op costs for reduction operations.
// We're assuming that reduction operations are performed in the following way:		// We're assuming that reduction operations are performed in the following way:
// 1. Non-pairwise reduction		// 1. Non-pairwise reduction
// %val1 = shufflevector<n x t> %val, <n x t> %undef,		// %val1 = shufflevector<n x t> %val, <n x t> %undef,
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty,
ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *		ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,		ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
NumVecElts, Ty);		NumVecElts, Ty);
ArithCost += (NumReduxLevels - LongVectorCount) *		ArithCost += (NumReduxLevels - LongVectorCount) *
ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);		ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);		return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
}		}

		unsigned getMinMaxReductionCost(Type Ty, Type CondTy, bool IsPairwise) {
		assert(Ty->isVectorTy() && "Expect a vector type");
		Type *ScalarTy = Ty->getVectorElementType();
		Type *ScalarCondTy = CondTy->getVectorElementType();
		unsigned NumVecElts = Ty->getVectorNumElements();
		unsigned NumReduxLevels = Log2_32(NumVecElts);
		unsigned CmpOpcode;
		if (Ty->getVectorElementType()->isFloatingPointTy())
		CmpOpcode = Instruction::FCmp;
		else {
		RKSimonUnsubmitted Not Done Reply Inline Actions ScalarTy->isFloatingPointTy() RKSimon: ScalarTy->isFloatingPointTy()
		assert(Ty->isIntOrIntVectorTy());
		RKSimonUnsubmitted Done Reply Inline Actions Missing assert message RKSimon: Missing assert message
		CmpOpcode = Instruction::ICmp;
		}
		// Try to calculate arithmetic and shuffle op costs for reduction operations.
		RKSimonUnsubmitted Done Reply Inline Actions Move this comment out and just above the arithmetic/minmax functions? RKSimon: Move this comment out and just above the arithmetic/minmax functions?
		// We're assuming that reduction operations are performed in the following way:
		// 1. Non-pairwise reduction
		// %val1 = shufflevector<n x t> %val, <n x t> %undef,
		// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
		// \----------------v-------------/ \----------v------------/
		// n/2 elements n/2 elements
		// %red1 = op <n x t> %val, <n x t> val1
		// After this operation we have a vector %red1 whose only meaningful part is the
		RKSimonUnsubmitted Done Reply Inline Actions where only the first n/2 elements are meaningful, RKSimon: where only the first n/2 elements are meaningful,
		// first n/2 elements, the second n/2 elements are undefined and can be
		// dropped. All other operations are actually working with the vector of
		// length n/2, not n, though the real vector length is still n.
		RKSimonUnsubmitted Done Reply Inline Actions , not n, RKSimon: , not n,
		// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
		// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
		// \----------------v-------------/ \----------v------------/
		// n/4 elements 3*n/4 elements
		// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
		// length n/2, the resulting vector has length n/4 etc.
		// 2. Pairwise reduction:
		// Everything is the same except for an additional shuffle operation which
		// is used to produce operands for pairwise kind of reductions.
		// %val1 = shufflevector<n x t> %val, <n x t> %undef,
		// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
		// \-------------v----------/ \----------v------------/
		// n/2 elements n/2 elements
		// %val2 = shufflevector<n x t> %val, <n x t> %undef,
		// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
		// \-------------v----------/ \----------v------------/
		// n/2 elements n/2 elements
		// %red1 = op <n x t> %val1, <n x t> val2
		// Again, the operation is performed on <n x t> vector, but the resulting
		// vector %red1 is <n/2 x t> vector.
		//
		// The cost model should take into account that the actual length of the
		// vector is reduced on each iteration.
		unsigned MinMaxCost = 0;
		unsigned ShuffleCost = 0;
		auto ConcreteTTI = static_cast<T >(this);
		std::pair<unsigned, MVT> LT =
		ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
		unsigned LongVectorCount = 0;
		unsigned MVTLen =
		LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
		while (NumVecElts > MVTLen) {
		NumVecElts /= 2;
		// Assume the pairwise shuffles add a cost.
		ShuffleCost += (IsPairwise + 1) *
		ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
		NumVecElts, Ty);
		MinMaxCost +=
		ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy) +
		ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy);
		Ty = VectorType::get(ScalarTy, NumVecElts);
		CondTy = VectorType::get(ScalarCondTy, NumVecElts);
		++LongVectorCount;
		}
		// The minimal length of the vector is limited by the real length of vector
		RKSimonUnsubmitted Not Done Reply Inline Actions This seems the same as the comment before getArithmeticReductionCost - in which case is it worth keeping? RKSimon: This seems the same as the comment before getArithmeticReductionCost - in which case is it…
		// operations performed on the current platform. That's why several final
		// reduction operations are performed on the vectors with the same
		// architecture-dependent length.
		ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
		ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
		NumVecElts, Ty);
		MinMaxCost +=
		(NumReduxLevels - LongVectorCount) *
		(ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy) +
		ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy));
		// Need 3 extractelement instructions for scalarization + an additional
		// scalar select instruction.
		return ShuffleCost + MinMaxCost +
		3 * getScalarizationOverhead(Ty, /Insert=/false,
		/Extract=/true) +
		static_cast<T *>(this)->getCmpSelInstrCost(Instruction::Select,
		ScalarTy, ScalarCondTy);
		}

unsigned getVectorSplitCost() { return 1; }		unsigned getVectorSplitCost() { return 1; }

/// @}		/// @}
};		};

/// \brief Concrete BasicTTIImpl that can be used if no further customization		/// \brief Concrete BasicTTIImpl that can be used if no further customization
/// is needed.		/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {		class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
typedef BasicTTIImplBase<BasicTTIImpl> BaseT;		typedef BasicTTIImplBase<BasicTTIImpl> BaseT;
friend class BasicTTIImplBase<BasicTTIImpl>;		friend class BasicTTIImplBase<BasicTTIImpl>;

const TargetSubtargetInfo *ST;		const TargetSubtargetInfo *ST;
const TargetLoweringBase *TLI;		const TargetLoweringBase *TLI;

const TargetSubtargetInfo *getST() const { return ST; }		const TargetSubtargetInfo *getST() const { return ST; }
const TargetLoweringBase *getTLI() const { return TLI; }		const TargetLoweringBase *getTLI() const { return TLI; }

public:		public:
explicit BasicTTIImpl(const TargetMachine *ST, const Function &F);		explicit BasicTTIImpl(const TargetMachine *ST, const Function &F);
};		};

}		}

#endif		#endif
		RKSimonUnsubmitted Not Done Reply Inline Actions ConcreteTTI->getCmpSelInstrCost( RKSimon: ConcreteTTI->getCmpSelInstrCost(

include/llvm/Transforms/Vectorize/SLPVectorizer.h

Show First 20 Lines • Show All 75 Lines • ▼ Show 20 Lines	private:
/// \@param BuildVector A list of users to ignore for the purpose of		/// \@param BuildVector A list of users to ignore for the purpose of
/// scheduling and that don't need extracting.		/// scheduling and that don't need extracting.
/// \returns true if a value was vectorized.		/// \returns true if a value was vectorized.
bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,		bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
ArrayRef<Value *> BuildVector = None,		ArrayRef<Value *> BuildVector = None,
bool AllowReorder = false);		bool AllowReorder = false);

/// \brief Try to vectorize a chain that may start at the operands of \V;		/// \brief Try to vectorize a chain that may start at the operands of \V;
bool tryToVectorize(BinaryOperator *V, slpvectorizer::BoUpSLP &R);		bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);

/// \brief Vectorize the store instructions collected in Stores.		/// \brief Vectorize the store instructions collected in Stores.
bool vectorizeStoreChains(slpvectorizer::BoUpSLP &R);		bool vectorizeStoreChains(slpvectorizer::BoUpSLP &R);

/// \brief Vectorize the index computations of the getelementptr instructions		/// \brief Vectorize the index computations of the getelementptr instructions
/// collected in GEPs.		/// collected in GEPs.
bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);		bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);

Show All 24 Lines

lib/Analysis/CostModel.cpp

Show All 18 Lines

#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/Passes.h"		#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/TargetTransformInfo.h"		#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"		#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Function.h"		#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
		#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"		#include "llvm/IR/Value.h"
#include "llvm/Pass.h"		#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"		#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"		#include "llvm/Support/raw_ostream.h"
using namespace llvm;		using namespace llvm;
		using namespace PatternMatch;

#define CM_NAME "cost-model"		#define CM_NAME "cost-model"
#define DEBUG_TYPE CM_NAME		#define DEBUG_TYPE CM_NAME

static cl::opt<bool> EnableReduxCost("costmodel-reduxcost", cl::init(false),		static cl::opt<bool> EnableReduxCost("costmodel-reduxcost", cl::init(false),
cl::Hidden,		cl::Hidden,
cl::desc("Recognize reduction patterns."));		cl::desc("Recognize reduction patterns."));

▲ Show 20 Lines • Show All 116 Lines • ▼ Show 20 Lines	static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft,
// we look at the left or right side.		// we look at the left or right side.
for (unsigned i = 0, e = (1 << Level), val = !IsLeft; i != e; ++i, val += 2)		for (unsigned i = 0, e = (1 << Level), val = !IsLeft; i != e; ++i, val += 2)
Mask[i] = val;		Mask[i] = val;

SmallVector<int, 16> ActualMask = SI->getShuffleMask();		SmallVector<int, 16> ActualMask = SI->getShuffleMask();
return Mask == ActualMask;		return Mask == ActualMask;
}		}

static bool matchPairwiseReductionAtLevel(const BinaryOperator *BinOp,		static unsigned getReductionOpcode(Value V, Value &L, Value *&R,
unsigned Level, unsigned NumLevels) {		Type *&CondTy) {
		L = nullptr;
		R = nullptr;
		CondTy = nullptr;
		RKSimonUnsubmitted Not Done Reply Inline Actions Do you need the this == &RD? Won't it always match on (Kind == RD.Kind && Opcode == RD.Opcode)? RKSimon: Do you need the this == &RD? Won't it always match on (Kind == RD.Kind && Opcode == RD.Opcode)?
		if (m_BinOp(m_Value(L), m_Value(R)).match(V))
		RKSimonUnsubmitted Not Done Reply Inline Actions They're not public, but maybe keep to style guide (also, maybe drop the class?) enum ReductionKind { RK_None, /// Not a reduction. RK_Arithmetic, /// Binary reduction data. RK_MinMax, /// Min/max reduction data. }; RKSimon: They're not public, but maybe keep to style guide (also, maybe drop the class?) ``` enum…
		return cast<BinaryOperator>(V)->getOpcode();
		if (auto *SI = dyn_cast<SelectInst>(V))
		if (m_UMin(m_Value(L), m_Value(R)).match(SI) \|\|
		m_SMin(m_Value(L), m_Value(R)).match(SI) \|\|
		m_SMax(m_Value(L), m_Value(R)).match(SI) \|\|
		m_UMax(m_Value(L), m_Value(R)).match(SI) \|\|
		m_OrdFMin(m_Value(L), m_Value(R)).match(SI) \|\|
		m_OrdFMax(m_Value(L), m_Value(R)).match(SI) \|\|
		m_UnordFMin(m_Value(L), m_Value(R)).match(SI) \|\|
		m_UnordFMax(m_Value(L), m_Value(R)).match(SI)) {
		auto *CI = cast<CmpInst>(SI->getCondition());
		CondTy = CI->getType();
		return CI->getOpcode();
		}
		return 0;
		}

		static bool matchPairwiseReductionAtLevel(Value *V, unsigned Level,
		unsigned NumLevels) {
// Match one level of pairwise operations.		// Match one level of pairwise operations.
// %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,		// %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
// <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>		// <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
// %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,		// %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
// <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>		// <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
// %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1		// %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
if (BinOp == nullptr)		if (!V)
return false;		return false;

assert(BinOp->getType()->isVectorTy() && "Expecting a vector type");		assert(V->getType()->isVectorTy() && "Expecting a vector type");

unsigned Opcode = BinOp->getOpcode();		Type *CondTy;
Value *L = BinOp->getOperand(0);		Value *L;
Value *R = BinOp->getOperand(1);		Value *R;
		unsigned Opcode = getReductionOpcode(V, L, R, CondTy);
		if (!Opcode)
		return false;

ShuffleVectorInst *LS = dyn_cast<ShuffleVectorInst>(L);		ShuffleVectorInst *LS = dyn_cast<ShuffleVectorInst>(L);
if (!LS && Level)		if (!LS && Level)
return false;		return false;
ShuffleVectorInst *RS = dyn_cast<ShuffleVectorInst>(R);		ShuffleVectorInst *RS = dyn_cast<ShuffleVectorInst>(R);
if (!RS && Level)		if (!RS && Level)
return false;		return false;

Show All 24 Lines	else if (NextLevelOpR && NextLevelOpR != L)
return false;		return false;

NextLevelOp = NextLevelOpL ? R : L;		NextLevelOp = NextLevelOpL ? R : L;
} else		} else
return false;		return false;

// Check that the next levels binary operation exists and matches with the		// Check that the next levels binary operation exists and matches with the
// current one.		// current one.
BinaryOperator *NextLevelBinOp = nullptr;		if (Level + 1 != NumLevels)
if (Level + 1 != NumLevels) {		if (Opcode != getReductionOpcode(NextLevelOp, L, R, CondTy))
if (!(NextLevelBinOp = dyn_cast<BinaryOperator>(NextLevelOp)))
return false;
else if (NextLevelBinOp->getOpcode() != Opcode)
return false;		return false;
}

// Shuffle mask for pairwise operation must match.		// Shuffle mask for pairwise operation must match.
if (matchPairwiseShuffleMask(LS, true, Level)) {		if (matchPairwiseShuffleMask(LS, /IsLeft=/true, Level)) {
if (!matchPairwiseShuffleMask(RS, false, Level))		if (!matchPairwiseShuffleMask(RS, /IsLeft=/false, Level))
return false;		return false;
} else if (matchPairwiseShuffleMask(RS, true, Level)) {		} else if (matchPairwiseShuffleMask(RS, /IsLeft=/true, Level)) {
if (!matchPairwiseShuffleMask(LS, false, Level))		if (!matchPairwiseShuffleMask(LS, /IsLeft=/false, Level))
return false;		return false;
} else		} else
return false;		return false;

if (++Level == NumLevels)		if (++Level == NumLevels)
return true;		return true;

// Match next level.		// Match next level.
return matchPairwiseReductionAtLevel(NextLevelBinOp, Level, NumLevels);		return matchPairwiseReductionAtLevel(NextLevelOp, Level, NumLevels);
}		}

static bool matchPairwiseReduction(const ExtractElementInst *ReduxRoot,		static bool matchPairwiseReduction(const ExtractElementInst *ReduxRoot,
unsigned &Opcode, Type *&Ty) {		unsigned &Opcode, Type &Ty, Type &CondTy) {
if (!EnableReduxCost)		if (!EnableReduxCost)
return false;		return false;

// Need to extract the first element.		// Need to extract the first element.
ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));		ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
unsigned Idx = ~0u;		unsigned Idx = ~0u;
if (CI)		if (CI)
Idx = CI->getZExtValue();		Idx = CI->getZExtValue();
if (Idx != 0)		if (Idx != 0)
return false;		return false;

BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));		Value *L;
if (!RdxStart)		Value *R;
		Value *RdxStart = ReduxRoot->getOperand(0);
		unsigned RdxOpcode = getReductionOpcode(RdxStart, L, R, CondTy);
		if (RdxOpcode == 0)
return false;		return false;

Type *VecTy = ReduxRoot->getOperand(0)->getType();		Type *VecTy = RdxStart->getType();
unsigned NumVecElems = VecTy->getVectorNumElements();		unsigned NumVecElems = VecTy->getVectorNumElements();
if (!isPowerOf2_32(NumVecElems))		if (!isPowerOf2_32(NumVecElems))
return false;		return false;

// We look for a sequence of shuffle,shuffle,add triples like the following		// We look for a sequence of shuffle,shuffle,add triples like the following
// that builds a pairwise reduction tree.		// that builds a pairwise reduction tree.
//		//
// (X0, X1, X2, X3)		// (X0, X1, X2, X3)
Show All 9 Lines	static bool matchPairwiseReduction(const ExtractElementInst *ReduxRoot,
// <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>		// <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
// %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,		// %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
// <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>		// <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
// %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1		// %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
// %r = extractelement <4 x float> %bin.rdx8, i32 0		// %r = extractelement <4 x float> %bin.rdx8, i32 0
if (!matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems)))		if (!matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems)))
return false;		return false;

Opcode = RdxStart->getOpcode();		Opcode = RdxOpcode;
Ty = VecTy;		Ty = VecTy;

return true;		return true;
}		}

static std::pair<Value , ShuffleVectorInst >		static std::pair<Value , ShuffleVectorInst >
getShuffleAndOtherOprd(BinaryOperator *B) {		getShuffleAndOtherOprd(Value L, Value R) {

Value *L = B->getOperand(0);
Value *R = B->getOperand(1);
ShuffleVectorInst *S = nullptr;		ShuffleVectorInst *S = nullptr;

if ((S = dyn_cast<ShuffleVectorInst>(L)))		if ((S = dyn_cast<ShuffleVectorInst>(L)))
return std::make_pair(R, S);		return std::make_pair(R, S);

S = dyn_cast<ShuffleVectorInst>(R);		S = dyn_cast<ShuffleVectorInst>(R);
return std::make_pair(L, S);		return std::make_pair(L, S);
}		}

static bool matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,		static bool matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
unsigned &Opcode, Type *&Ty) {		unsigned &Opcode, Type *&Ty,
		Type *&CondTy) {
		CondTy = nullptr;
if (!EnableReduxCost)		if (!EnableReduxCost)
return false;		return false;

// Need to extract the first element.		// Need to extract the first element.
ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));		ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
unsigned Idx = ~0u;		unsigned Idx = ~0u;
if (CI)		if (CI)
Idx = CI->getZExtValue();		Idx = CI->getZExtValue();
if (Idx != 0)		if (Idx != 0)
return false;		return false;

BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));		Value *L;
if (!RdxStart)		Value *R;
		Value *RdxStart = ReduxRoot->getOperand(0);
		unsigned RdxOpcode = getReductionOpcode(RdxStart, L, R, CondTy);
		if (RdxOpcode == 0)
return false;		return false;
unsigned RdxOpcode = RdxStart->getOpcode();

Type *VecTy = ReduxRoot->getOperand(0)->getType();		Type *VecTy = ReduxRoot->getOperand(0)->getType();
unsigned NumVecElems = VecTy->getVectorNumElements();		unsigned NumVecElems = VecTy->getVectorNumElements();
if (!isPowerOf2_32(NumVecElems))		if (!isPowerOf2_32(NumVecElems))
return false;		return false;

// We look for a sequence of shuffles and adds like the following matching one		// We look for a sequence of shuffles and adds like the following matching one
// fadd, shuffle vector pair at a time.		// fadd, shuffle vector pair at a time.
//		//
// %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef,		// %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef,
// <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>		// <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
// %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf		// %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
// %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,		// %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,
// <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>		// <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
// %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7		// %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
// %r = extractelement <4 x float> %bin.rdx8, i32 0		// %r = extractelement <4 x float> %bin.rdx8, i32 0

unsigned MaskStart = 1;		unsigned MaskStart = 1;
Value *RdxOp = RdxStart;		Value *RdxOp = RdxStart;
SmallVector<int, 32> ShuffleMask(NumVecElems, 0);		SmallVector<int, 32> ShuffleMask(NumVecElems, 0);
unsigned NumVecElemsRemain = NumVecElems;		unsigned NumVecElemsRemain = NumVecElems;
while (NumVecElemsRemain - 1) {		while (NumVecElemsRemain - 1) {
// Check for the right reduction operation.		// Check for the right reduction operation.
BinaryOperator *BinOp;		Value *Op = RdxOp;
if (!(BinOp = dyn_cast<BinaryOperator>(RdxOp)))		if (getReductionOpcode(Op, L, R, CondTy) != RdxOpcode)
return false;
if (BinOp->getOpcode() != RdxOpcode)
return false;		return false;

Value *NextRdxOp;		Value *NextRdxOp;
ShuffleVectorInst *Shuffle;		ShuffleVectorInst *Shuffle;
std::tie(NextRdxOp, Shuffle) = getShuffleAndOtherOprd(BinOp);		std::tie(NextRdxOp, Shuffle) = getShuffleAndOtherOprd(L, R);

// Check the current reduction operation and the shuffle use the same value.		// Check the current reduction operation and the shuffle use the same value.
if (Shuffle == nullptr)		if (Shuffle == nullptr)
return false;		return false;
if (Shuffle->getOperand(0) != NextRdxOp)		if (Shuffle->getOperand(0) != NextRdxOp)
return false;		return false;

// Check that shuffle masks matches.		// Check that shuffle masks matches.
▲ Show 20 Lines • Show All 99 Lines • ▼ Show 20 Lines	case Instruction::ExtractElement: {
unsigned Idx = -1;		unsigned Idx = -1;
if (CI)		if (CI)
Idx = CI->getZExtValue();		Idx = CI->getZExtValue();

// Try to match a reduction sequence (series of shufflevector and vector		// Try to match a reduction sequence (series of shufflevector and vector
// adds followed by an extractelement).		// adds followed by an extractelement).
unsigned ReduxOpCode;		unsigned ReduxOpCode;
Type *ReduxType;		Type *ReduxType;
		Type *CondType;

if (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType))		if (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType, CondType)) {
return TTI->getReductionCost(ReduxOpCode, ReduxType, false);		return CondType
else if (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType))		? TTI->getMinMaxReductionCost(ReduxType, CondType,
return TTI->getReductionCost(ReduxOpCode, ReduxType, true);		/IsPairwiseForm=/false)
		: TTI->getArithmeticReductionCost(ReduxOpCode, ReduxType,
		/IsPairwiseForm=/false);
		}
		if (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType, CondType)) {
		return CondType
		? TTI->getMinMaxReductionCost(ReduxType, CondType,
		/IsPairwiseForm=/true)
		: TTI->getArithmeticReductionCost(ReduxOpCode, ReduxType,
		/IsPairwiseForm=/true);
		}

return TTI->getVectorInstrCost(I->getOpcode(),		return TTI->getVectorInstrCost(I->getOpcode(),
EEI->getOperand(0)->getType(), Idx);		EEI->getOperand(0)->getType(), Idx);
}		}
case Instruction::InsertElement: {		case Instruction::InsertElement: {
const InsertElementInst * IE = cast<InsertElementInst>(I);		const InsertElementInst * IE = cast<InsertElementInst>(I);
ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));		ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
unsigned Idx = -1;		unsigned Idx = -1;
▲ Show 20 Lines • Show All 58 Lines • Show Last 20 Lines

lib/Analysis/TargetTransformInfo.cpp

	Show First 20 Lines • Show All 389 Lines • ▼ Show 20 Lines

	int TargetTransformInfo::getAddressComputationCost(Type *Tp,			int TargetTransformInfo::getAddressComputationCost(Type *Tp,
	bool IsComplex) const {			bool IsComplex) const {
	int Cost = TTIImpl->getAddressComputationCost(Tp, IsComplex);			int Cost = TTIImpl->getAddressComputationCost(Tp, IsComplex);
	assert(Cost >= 0 && "TTI should not produce negative costs!");			assert(Cost >= 0 && "TTI should not produce negative costs!");
	return Cost;			return Cost;
	}			}

	int TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty,			int TargetTransformInfo::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
	bool IsPairwiseForm) const {			bool IsPairwiseForm) const {
	int Cost = TTIImpl->getReductionCost(Opcode, Ty, IsPairwiseForm);			int Cost = TTIImpl->getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm);
				assert(Cost >= 0 && "TTI should not produce negative costs!");
				return Cost;
				}

				int TargetTransformInfo::getMinMaxReductionCost(Type *Ty, Type *CondTy,
				bool IsPairwiseForm) const {
				int Cost = TTIImpl->getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm);
	assert(Cost >= 0 && "TTI should not produce negative costs!");			assert(Cost >= 0 && "TTI should not produce negative costs!");
	return Cost;			return Cost;
	}			}

	unsigned			unsigned
	TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {			TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
	return TTIImpl->getCostOfKeepingLiveOverCall(Tys);			return TTIImpl->getCostOfKeepingLiveOverCall(Tys);
	}			}
	▲ Show 20 Lines • Show All 103 Lines • Show Last 20 Lines

lib/Target/X86/X86TargetTransformInfo.h

Show First 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
bool VariableMask, unsigned Alignment);		bool VariableMask, unsigned Alignment);
int getAddressComputationCost(Type *PtrTy, bool IsComplex);		int getAddressComputationCost(Type *PtrTy, bool IsComplex);

int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,		int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> Tys, FastMathFlags FMF);		ArrayRef<Type *> Tys, FastMathFlags FMF);
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,		int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Value *> Args, FastMathFlags FMF);		ArrayRef<Value *> Args, FastMathFlags FMF);

int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);		int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
		bool IsPairwiseForm);

		int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm);

int getIntImmCost(int64_t);		int getIntImmCost(int64_t);

int getIntImmCost(const APInt &Imm, Type *Ty);		int getIntImmCost(const APInt &Imm, Type *Ty);

int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);		int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,		int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty);		Type *Ty);
Show All 20 Lines

lib/Target/X86/X86TargetTransformInfo.cpp

Show First 20 Lines • Show All 1,570 Lines • ▼ Show 20 Lines	int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
unsigned NumVectorInstToHideOverhead = 10;		unsigned NumVectorInstToHideOverhead = 10;

if (Ty->isVectorTy() && IsComplex)		if (Ty->isVectorTy() && IsComplex)
return NumVectorInstToHideOverhead;		return NumVectorInstToHideOverhead;

return BaseT::getAddressComputationCost(Ty, IsComplex);		return BaseT::getAddressComputationCost(Ty, IsComplex);
}		}

int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,		int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
bool IsPairwise) {		bool IsPairwise) {

std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);		std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

MVT MTy = LT.second;		MVT MTy = LT.second;

int ISD = TLI->InstructionOpcodeToISD(Opcode);		int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");		assert(ISD && "Invalid opcode");

▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines	if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))		if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
return LT.first * Entry->Cost;		return LT.first * Entry->Cost;

if (ST->hasSSE42())		if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))		if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
return LT.first * Entry->Cost;		return LT.first * Entry->Cost;
}		}

return BaseT::getReductionCost(Opcode, ValTy, IsPairwise);		return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
		}

		int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
		bool IsPairwise) {

		std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

		MVT MTy = LT.second;

		int ISD = ValTy->isIntOrIntVectorTy() ? ISD::SMIN : ISD::FMINNUM;
		assert(ISD && "Invalid opcode");
		RKSimon (inline comment; unsubmitted, done): Unnecessary?
		ABataev (author, inline reply; unsubmitted, not done): Yes, missed it, thanks.

		// We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
		// and make it as the cost.

		static const CostTblEntry SSE42CostTblPairWise[] = {
		{ ISD::FMINNUM, MVT::v2f64, 3 },
		{ ISD::FMINNUM, MVT::v4f32, 2 },
		{ ISD::SMIN, MVT::v2i64, 7 }, // The data reported by the IACA is "6.8"
		{ ISD::SMIN, MVT::v4i32, 1 }, // The data reported by the IACA is "1.5"
		{ ISD::SMIN, MVT::v8i16, 2 },
		};

		static const CostTblEntry AVX1CostTblPairWise[] = {
		{ ISD::FMINNUM, MVT::v4f32, 1 },
		{ ISD::FMINNUM, MVT::v4f64, 1 },
		{ ISD::FMINNUM, MVT::v8f32, 2 },
		{ ISD::SMIN, MVT::v2i64, 3 },
		{ ISD::SMIN, MVT::v4i32, 1 },
		{ ISD::SMIN, MVT::v8i16, 1 },
		{ ISD::SMIN, MVT::v8i32, 3 },
		};

		static const CostTblEntry AVX2CostTblPairWise[] = {
		{ ISD::SMIN, MVT::v4i64, 2 },
		{ ISD::SMIN, MVT::v8i32, 1 },
		{ ISD::SMIN, MVT::v16i16, 1 },
		{ ISD::SMIN, MVT::v32i8, 2 },
		};

		static const CostTblEntry AVX512CostTblPairWise[] = {
		{ ISD::FMINNUM, MVT::v8f64, 1 },
		{ ISD::FMINNUM, MVT::v16f32, 2 },
		{ ISD::SMIN, MVT::v8i64, 2 },
		{ ISD::SMIN, MVT::v16i32, 1 },
		};

		static const CostTblEntry SSE42CostTblNoPairWise[] = {
		{ ISD::FMINNUM, MVT::v2f64, 3 },
		{ ISD::FMINNUM, MVT::v4f32, 3 },
		{ ISD::SMIN, MVT::v2i64, 7 }, // The data reported by the IACA is "6.8"
		{ ISD::SMIN, MVT::v4i32, 1 }, // The data reported by the IACA is "1.5"
		{ ISD::SMIN, MVT::v8i16, 1 }, // The data reported by the IACA is "1.5"
		};

		static const CostTblEntry AVX1CostTblNoPairWise[] = {
		{ ISD::FMINNUM, MVT::v4f32, 1 },
		{ ISD::FMINNUM, MVT::v4f64, 1 },
		{ ISD::FMINNUM, MVT::v8f32, 1 },
		{ ISD::SMIN, MVT::v2i64, 3 },
		{ ISD::SMIN, MVT::v4i32, 1 },
		{ ISD::SMIN, MVT::v8i16, 1 },
		{ ISD::SMIN, MVT::v8i32, 2 },
		};

		static const CostTblEntry AVX2CostTblNoPairWise[] = {
		{ ISD::SMIN, MVT::v4i64, 1 },
		{ ISD::SMIN, MVT::v8i32, 1 },
		{ ISD::SMIN, MVT::v16i16, 1 },
		{ ISD::SMIN, MVT::v32i8, 1 },
		};

		static const CostTblEntry AVX512CostTblNoPairWise[] = {
		{ ISD::FMINNUM, MVT::v8f64, 1 },
		{ ISD::FMINNUM, MVT::v16f32, 2 },
		{ ISD::SMIN, MVT::v8i64, 1 },
		{ ISD::SMIN, MVT::v16i32, 1 },
		};

		if (IsPairwise) {
		if (ST->hasAVX512())
		if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
		return LT.first * Entry->Cost;

		if (ST->hasAVX2())
		if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
		return LT.first * Entry->Cost;

		if (ST->hasAVX())
		if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
		return LT.first * Entry->Cost;

		if (ST->hasSSE42())
		if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
		return LT.first * Entry->Cost;
		} else {
		if (ST->hasAVX512())
		if (const auto *Entry =
		CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
		return LT.first * Entry->Cost;

		if (ST->hasAVX2())
		if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
		return LT.first * Entry->Cost;

		if (ST->hasAVX())
		if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
		return LT.first * Entry->Cost;

		if (ST->hasSSE42())
		if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
		return LT.first * Entry->Cost;
		}

		return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise);
}		}

/// \brief Calculate the cost of materializing a 64-bit value. This helper		/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it		/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.		/// is valid to return a cost of ZERO.
int X86TTIImpl::getIntImmCost(int64_t Val) {		int X86TTIImpl::getIntImmCost(int64_t Val) {
if (Val == 0)		if (Val == 0)
return TTI::TCC_Free;		return TTI::TCC_Free;

if (isInt<32>(Val))		if (isInt<32>(Val))
return TTI::TCC_Basic;		return TTI::TCC_Basic;

return 2 * TTI::TCC_Basic;		return 2 * TTI::TCC_Basic;
}		}

		RKSimon (inline comment; unsubmitted, not done): One cost entry per line
int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {		int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());		assert(Ty->isIntegerTy());

unsigned BitSize = Ty->getPrimitiveSizeInBits();		unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0)		if (BitSize == 0)
return ~0U;		return ~0U;

// Never hoist constants larger than 128bit, because this might lead to		// Never hoist constants larger than 128bit, because this might lead to
▲ Show 20 Lines • Show All 344 Lines • Show Last 20 Lines

lib/Transforms/Vectorize/SLPVectorizer.cpp

Show All 28 Lines
#include "llvm/Analysis/VectorUtils.h"		#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"		#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"		#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"		#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"		#include "llvm/IR/NoFolder.h"
		#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"		#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"		#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"		#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"		#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"		#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"		#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize.h"		#include "llvm/Transforms/Vectorize.h"
#include <algorithm>		#include <algorithm>
#include <memory>		#include <memory>

using namespace llvm;		using namespace llvm;
		using namespace llvm::PatternMatch;
using namespace slpvectorizer;		using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"		#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"		#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");		STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>		static cl::opt<int>
▲ Show 20 Lines • Show All 3,959 Lines • ▼ Show 20 Lines	for (unsigned I = NextInst; I < MaxInst; ++I) {
Changed = true;		Changed = true;
}		}
}		}
}		}

return Changed;		return Changed;
}		}

bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {		bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (!V)		if (!I)
		return false;

		if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
return false;		return false;

Value *P = V->getParent();		Value *P = I->getParent();

// Vectorize in current basic block only.		// Vectorize in current basic block only.
auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));		auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));		auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)		if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
return false;		return false;

// Try to vectorize V.		// Try to vectorize V.
if (tryToVectorizePair(Op0, Op1, R))		if (tryToVectorizePair(Op0, Op1, R))
return true;		return true;

auto *A = dyn_cast<BinaryOperator>(Op0);		auto *A = dyn_cast<BinaryOperator>(Op0);
▲ Show 20 Lines • Show All 77 Lines • ▼ Show 20 Lines
/// +		/// +
/// |		/// |
/// *p =		/// *p =
///		///
class HorizontalReduction {		class HorizontalReduction {
SmallVector<Value *, 16> ReductionOps;		SmallVector<Value *, 16> ReductionOps;
SmallVector<Value *, 32> ReducedVals;		SmallVector<Value *, 32> ReducedVals;

BinaryOperator *ReductionRoot;		struct OperationData {
		enum MinMaxIntFloat {
		IntMin = Instruction::BinaryOpsEnd,
		RKSimon (inline comment; unsubmitted, not done): Call this RK_None to match the other version?
		IntUMin,
		FloatMin,
		IntMax,
		IntUMax,
		FloatMax
		RKSimon (inline comment; unsubmitted, not done): Same as above:
		```
		enum ReductionKind {
		  RK_Not,        /// Not a reduction.
		  RK_Arithmetic, /// Binary reduction data.
		  RK_Min,        /// Minimum reduction data.
		  RK_UMin,       /// Unsigned minimum reduction data.
		  RK_Max,        /// Maximum reduction data.
		  RK_UMax,       /// Unsigned maximum reduction data.
		};
		```
		};
		bool Validity = false;
		unsigned Opcode = 0;
		Value *LHS = nullptr;
		Value *RHS = nullptr;
		Type *CondTy = nullptr;

		public:
		OperationData() = default;
		OperationData(Value *V) {
		if (auto *I = dyn_cast<Instruction>(V)) {
		Validity = true;
		Opcode = I->getOpcode();
		}
		}
		OperationData(unsigned Opcode, Value *LHS, Value *RHS)
		: Validity(true), Opcode(Opcode), LHS(LHS), RHS(RHS) {}
		OperationData(Value *LHS, Value *RHS, Type *CondTy, bool IsMaximum,
		bool IsUnsigned = false)
		: Validity(true), LHS(LHS), RHS(RHS), CondTy(CondTy) {
		if (LHS->getType()->isIntegerTy()) {
		if (IsUnsigned)
		Opcode = IsMaximum ? IntUMax : IntUMin;
		else
		Opcode = IsMaximum ? IntMax : IntMin;
		} else
		Opcode = IsMaximum ? FloatMax : FloatMin;
		}
		operator bool() const { return Validity; }
		bool isBinOp() const {
		return Validity && LHS && RHS && Instruction::isBinaryOp(Opcode);
		}
		bool isMinMax() const {
		return Validity && LHS && RHS && Opcode >= IntMin && Opcode <= FloatMax;
		}
		bool isVectorizable() const { return Validity && LHS && RHS; }
		bool operator==(const OperationData &OD) {
		return this == &OD || (Validity == OD.Validity && (!LHS == !OD.LHS) &&
		(!RHS == !OD.RHS) && Opcode == OD.Opcode);
		}
		bool operator!=(const OperationData &OD) { return !(*this == OD); }
		void clear() {
		Validity = false;
		LHS = nullptr;
		RHS = nullptr;
		Opcode = 0;
		}
		unsigned getOpcode() const {
		assert(isVectorizable());
		if (isBinOp())
		return Opcode;
		switch (Opcode) {
		case FloatMax:
		case FloatMin:
		return Instruction::FCmp;
		case IntMin:
		case IntUMin:
		case IntMax:
		case IntUMax:
		return Instruction::ICmp;
		default:
		break;
		}
		llvm_unreachable("Unexpected opcode");
		}
		Value *getLHS() const { return LHS; }
		Value *getRHS() const { return RHS; }
		Type *getConditionType() const { return CondTy; }
		bool isFloatMinMax() const {
		return isMinMax() && (Opcode == FloatMin || Opcode == FloatMax);
		}
		bool isIntMinMax() const {
		return isMinMax() && (Opcode == IntMin || Opcode == IntMax ||
		Opcode == IntUMin || Opcode == IntUMax);
		}
		Value *createOp(IRBuilder<> &Builder, Value *L, Value *R,
		const Twine &Name = "") const {
		if (isBinOp()) {
		assert(Opcode == Instruction::FAdd || Opcode == Instruction::Add);
		if (Opcode == Instruction::FAdd)
		return Builder.CreateFAdd(L, R, Name);
		return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
		}
		assert(Opcode >= OperationData::IntMin &&
		Opcode <= OperationData::FloatMax);
		Value *Cmp;
		switch (Opcode) {
		case OperationData::IntMin:
		Cmp = Builder.CreateICmpSLT(L, R);
		break;
		case OperationData::IntUMin:
		Cmp = Builder.CreateICmpULT(L, R);
		break;
		case OperationData::FloatMin:
		Cmp = Builder.CreateFCmpOLT(L, R);
		break;
		case OperationData::IntMax:
		Cmp = Builder.CreateICmpSGT(L, R);
		break;
		case OperationData::IntUMax:
		Cmp = Builder.CreateICmpUGT(L, R);
		break;
		case OperationData::FloatMax:
		Cmp = Builder.CreateFCmpOGT(L, R);
		break;
		default:
		llvm_unreachable("Unknown operation");
		}
		return Builder.CreateSelect(Cmp, L, R, Name);
		}
		};

		Instruction *ReductionRoot;
// After successfull horizontal reduction vectorization attempt for PHI node		// After successfull horizontal reduction vectorization attempt for PHI node
// vectorizer tries to update root binary op by combining vectorized tree and		// vectorizer tries to update root binary op by combining vectorized tree and
// the ReductionPHI node. But during vectorization this ReductionPHI can be		// the ReductionPHI node. But during vectorization this ReductionPHI can be
// vectorized itself and replaced by the undef value, while the instruction		// vectorized itself and replaced by the undef value, while the instruction
// itself is marked for deletion. This 'marked for deletion' PHI node then can		// itself is marked for deletion. This 'marked for deletion' PHI node then can
// be used in new binary operation, causing "Use still stuck around after Def		// be used in new binary operation, causing "Use still stuck around after Def
// is destroyed" crash upon PHI node deletion.		// is destroyed" crash upon PHI node deletion.
WeakVH ReductionPHI;		WeakVH ReductionPHI;

/// The opcode of the reduction.		/// The operation data of the reduction operation.
unsigned ReductionOpcode;		OperationData ReductionData;
/// The opcode of the values we perform a reduction on.		/// The operation data of the values we perform a reduction on.
unsigned ReducedValueOpcode;		OperationData ReducedValueData;
/// Should we model this reduction as a pairwise reduction tree or a tree that		/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.		/// splits the vector in halves and adds those halves.
bool IsPairwiseReduction;		bool IsPairwiseReduction;

		static OperationData getOperationData(Value *V) {
		if (!V)
		return OperationData();

		Value *LHS;
		Value *RHS;
		if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V))
		return {cast<BinaryOperator>(V)->getOpcode(), LHS, RHS};
		if (auto *Select = dyn_cast<SelectInst>(V)) {
		// Look for a min/max pattern.
		if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select))
		return {LHS, RHS, Select->getCondition()->getType(),
		/*IsMaximum=*/false, /*IsUnsigned=*/true};
		else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select))
		return {LHS, RHS, Select->getCondition()->getType(),
		/*IsMaximum=*/false, /*IsUnsigned=*/false};
		else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
		m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select))
		return {LHS, RHS, Select->getCondition()->getType(),
		/*IsMaximum=*/false};
		else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select))
		return {LHS, RHS, Select->getCondition()->getType(), /*IsMaximum=*/true,
		/*IsUnsigned=*/true};
		else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select))
		return {LHS, RHS, Select->getCondition()->getType(), /*IsMaximum=*/true,
		/*IsUnsigned=*/false};
		else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
		m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select))
		return {LHS, RHS, Select->getCondition()->getType(),
		/*IsMaximum=*/true};
		}
		return {V};
		}

public:		public:
/// The width of one full horizontal reduction operation.		/// The width of one full horizontal reduction operation.
unsigned ReduxWidth;		unsigned ReduxWidth;

/// Minimal width of available vector registers. It's used to determine		/// Minimal width of available vector registers. It's used to determine
/// ReduxWidth.		/// ReduxWidth.
unsigned MinVecRegSize;		unsigned MinVecRegSize;

HorizontalReduction(unsigned MinVecRegSize)		HorizontalReduction(unsigned MinVecRegSize)
: ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),		: ReductionRoot(nullptr), IsPairwiseReduction(false), ReduxWidth(0),
IsPairwiseReduction(false), ReduxWidth(0),
MinVecRegSize(MinVecRegSize) {}		MinVecRegSize(MinVecRegSize) {}

/// \brief Try to find a reduction tree.		/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {		bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&		assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");		"Thi phi needs to use the binary operator");

		ReductionData = getOperationData(B);

// We could have a initial reductions that is not an add.		// We could have a initial reductions that is not an add.
// r *= v1 + v2 + v3 + v4		// r *= v1 + v2 + v3 + v4
// In such a case start looking for a tree rooted in the first '+'.		// In such a case start looking for a tree rooted in the first '+'.
if (Phi) {		if (Phi && ReductionData.isVectorizable()) {
if (B->getOperand(0) == Phi) {		if (ReductionData.getLHS() == Phi) {
Phi = nullptr;		Phi = nullptr;
B = dyn_cast<BinaryOperator>(B->getOperand(1));		B = dyn_cast<Instruction>(ReductionData.getRHS());
} else if (B->getOperand(1) == Phi) {		ReductionData = getOperationData(B);
		} else if (ReductionData.getRHS() == Phi) {
Phi = nullptr;		Phi = nullptr;
B = dyn_cast<BinaryOperator>(B->getOperand(0));		B = dyn_cast<Instruction>(ReductionData.getLHS());
		ReductionData = getOperationData(B);
}		}
}		}

if (!B)		if (!B || !ReductionData.isVectorizable())
return false;		return false;

Type *Ty = B->getType();		Type *Ty = B->getType();
if (!isValidElementType(Ty))		if (!isValidElementType(Ty))
return false;		return false;

const DataLayout &DL = B->getModule()->getDataLayout();		const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();		ReducedValueData.clear();
ReducedValueOpcode = 0;
// FIXME: Register size should be a parameter to this function, so we can		// FIXME: Register size should be a parameter to this function, so we can
// try different vectorization factors.		// try different vectorization factors.
ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);		ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;		ReductionRoot = B;
ReductionPHI = Phi;		ReductionPHI = Phi;

if (ReduxWidth < 4)		if (ReduxWidth < 4)
return false;		return false;

// We currently only support adds.		// We currently only support adds and min/max.
if (ReductionOpcode != Instruction::Add &&		if (ReductionData.getOpcode() != Instruction::Add &&
ReductionOpcode != Instruction::FAdd)		ReductionData.getOpcode() != Instruction::FAdd &&
		!ReductionData.isMinMax())
return false;		return false;

// Post order traverse the reduction tree starting at B. We only handle true		// Post order traverse the reduction tree starting at B. We only handle true
// trees containing only binary operators or selects.		// trees containing only binary operators or selects.
		bool IsBinOp = ReductionData.isBinOp();
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;		SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
Stack.push_back(std::make_pair(B, 0));		Stack.push_back(std::make_pair(B, IsBinOp ? 0 : 1));
while (!Stack.empty()) {		while (!Stack.empty()) {
Instruction *TreeN = Stack.back().first;		Instruction *TreeN = Stack.back().first;
unsigned EdgeToVist = Stack.back().second++;		unsigned EdgeToVist = Stack.back().second++;
bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;		OperationData OpData = getOperationData(TreeN);
		bool IsReducedValue = OpData != ReductionData;

// Only handle trees in the current basic block.		// Only handle trees in the current basic block.
if (TreeN->getParent() != B->getParent())		if (TreeN->getParent() != B->getParent())
return false;		return false;

// Each tree node needs to have one user except for the ultimate		// Each tree node needs to have one user except for the ultimate
// reduction.		// reduction.
if (!TreeN->hasOneUse() && TreeN != B)		if (!TreeN->hasOneUse() && (IsBinOp || !TreeN->hasNUses(2)) && TreeN != B)
return false;		return false;

// Postorder vist.		// Postorder vist.
if (EdgeToVist == 2 || IsReducedValue) {		if (((IsBinOp && EdgeToVist == 2) ||
		(OpData && OpData.isMinMax() && EdgeToVist == 3)) ||
		IsReducedValue) {
if (IsReducedValue) {		if (IsReducedValue) {
// Make sure that the opcodes of the operations that we are going to		// Make sure that the opcodes of the operations that we are going to
// reduce match.		// reduce match.
if (!ReducedValueOpcode)		if (!ReducedValueData)
ReducedValueOpcode = TreeN->getOpcode();		ReducedValueData = OpData;
else if (ReducedValueOpcode != TreeN->getOpcode())		else if (ReducedValueData != OpData)
return false;		return false;
ReducedVals.push_back(TreeN);		ReducedVals.push_back(TreeN);
} else {		} else {
// We need to be able to reassociate the adds.		// We need to be able to reassociate the adds.
if (!TreeN->isAssociative())		if (!TreeN->isAssociative() &&
		!(OpData.isFloatMinMax() &&
		cast<Instruction>(TreeN->getOperand(0))->hasUnsafeAlgebra()) &&
		!OpData.isIntMinMax())
return false;		return false;
ReductionOps.push_back(TreeN);		ReductionOps.push_back(TreeN);
}		}
// Retract.		// Retract.
Stack.pop_back();		Stack.pop_back();
continue;		continue;
}		}

// Visit left or right.		// Visit left or right.
Value *NextV = TreeN->getOperand(EdgeToVist);		Value *NextV = TreeN->getOperand(EdgeToVist);
if (NextV != Phi) {		if (NextV != Phi) {
auto *I = dyn_cast<Instruction>(NextV);		auto *I = dyn_cast<Instruction>(NextV);
		OpData = getOperationData(I);
// Continue analysis if the next operand is a reduction operation or		// Continue analysis if the next operand is a reduction operation or
// (possibly) a reduced value. If the reduced value opcode is not set,		// (possibly) a reduced value. If the reduced value opcode is not set,
// the first met operation != reduction operation is considered as the		// the first met operation != reduction operation is considered as the
// reduced value class.		// reduced value class.
if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||		if (I && (!ReducedValueData || OpData == ReducedValueData ||
I->getOpcode() == ReductionOpcode)) {		OpData == ReductionData)) {
if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)		if (!ReducedValueData && OpData != ReductionData)
ReducedValueOpcode = I->getOpcode();		ReducedValueData = OpData;
Stack.push_back(std::make_pair(I, 0));		Stack.push_back(std::make_pair(I, OpData.isMinMax() ? 1 : 0));
continue;		continue;
}		}
return false;		return false;
}		}
}		}
return true;		return true;
}		}

Show All 37 Lines	for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
// Vectorize a tree.		// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();		DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
Value *VectorizedRoot = V.vectorizeTree();		Value *VectorizedRoot = V.vectorizeTree();

// Emit a reduction.		// Emit a reduction.
Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);		Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
if (VectorizedTree) {		if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);		Builder.SetCurrentDebugLocation(Loc);
VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,		VectorizedTree = ReductionData.createOp(Builder, VectorizedTree,
ReducedSubTree, "bin.rdx");		ReducedSubTree, "bin.rdx");
} else		} else
VectorizedTree = ReducedSubTree;		VectorizedTree = ReducedSubTree;
}		}

if (VectorizedTree) {		if (VectorizedTree) {
// Finish the reduction.		// Finish the reduction.
for (; i < NumReducedVals; ++i) {		for (; i < NumReducedVals; ++i) {
Builder.SetCurrentDebugLocation(		Builder.SetCurrentDebugLocation(
cast<Instruction>(ReducedVals[i])->getDebugLoc());		cast<Instruction>(ReducedVals[i])->getDebugLoc());
VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,		VectorizedTree =
ReducedVals[i]);		ReductionData.createOp(Builder, VectorizedTree, ReducedVals[i]);
}		}
// Update users.		// Update users.
if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {		if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
assert(ReductionRoot && "Need a reduction operation");		assert(ReductionRoot && "Need a reduction operation");
		if (ReductionData.isBinOp()) {
ReductionRoot->setOperand(0, VectorizedTree);		ReductionRoot->setOperand(0, VectorizedTree);
ReductionRoot->setOperand(1, ReductionPHI);		ReductionRoot->setOperand(1, ReductionPHI);
		} else {
		auto *Cmp = cast<CmpInst>(ReductionRoot->getOperand(1));
		Cmp->setOperand(0, VectorizedTree);
		Cmp->setOperand(1, ReductionPHI);
		ReductionRoot->setOperand(1, VectorizedTree);
		ReductionRoot->setOperand(2, ReductionPHI);
		}
} else		} else
ReductionRoot->replaceAllUsesWith(VectorizedTree);		ReductionRoot->replaceAllUsesWith(VectorizedTree);
}		}
return VectorizedTree != nullptr;		return VectorizedTree != nullptr;
}		}

unsigned numReductionValues() const {		unsigned numReductionValues() const {
return ReducedVals.size();		return ReducedVals.size();
}		}

private:		private:
/// \brief Calculate the cost of a reduction.		/// \brief Calculate the cost of a reduction.
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {		int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
Type *ScalarTy = FirstReducedVal->getType();		Type *ScalarTy = FirstReducedVal->getType();
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);		Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
		Type *ScalarCondTy = ReductionData.getConditionType();
int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);		Type *VecCondTy =
int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);		ScalarCondTy ? VectorType::get(ScalarCondTy, ReduxWidth) : nullptr;

		int PairwiseRdxCost =
		ReductionData.isMinMax()
		? TTI->getMinMaxReductionCost(VecTy, VecCondTy,
		/*IsPairwiseForm=*/true)
		: TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
		/IsPairwiseForm=/true);
		int SplittingRdxCost =
		ReductionData.isMinMax()
		? TTI->getMinMaxReductionCost(VecTy, VecCondTy,
		/*IsPairwiseForm=*/false)
		: TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
		/IsPairwiseForm=/false);

IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;		IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;		int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

int ScalarReduxCost =		int ScalarReduxCost;
		if (ReductionData.isBinOp()) {
		ScalarReduxCost =
(ReduxWidth - 1) *		(ReduxWidth - 1) *
TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);		TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
		} else {
		assert(ReductionData.isMinMax());
		ScalarReduxCost =
		(ReduxWidth - 1) *
		(TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
		TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
		ScalarCondTy));
		}

DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost		DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
<< " for reduction that starts with " << *FirstReducedVal		<< " for reduction that starts with " << *FirstReducedVal
<< " (It is a "		<< " (It is a "
<< (IsPairwiseReduction ? "pairwise" : "splitting")		<< (IsPairwiseReduction ? "pairwise" : "splitting")
<< " reduction)\n");		<< " reduction)\n");

return VecReduxCost - ScalarReduxCost;		return VecReduxCost - ScalarReduxCost;
}		}

static Value createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value L,
Value *R, const Twine &Name = "") {
if (Opcode == Instruction::FAdd)
return Builder.CreateFAdd(L, R, Name);
return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
}

/// \brief Emit a horizontal reduction of the vectorized value.		/// \brief Emit a horizontal reduction of the vectorized value.
Value emitReduction(Value VectorizedValue, IRBuilder<> &Builder) {		Value emitReduction(Value VectorizedValue, IRBuilder<> &Builder) {
assert(VectorizedValue && "Need to have a vectorized tree node");		assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&		assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");		"We only handle power-of-two reductions for now");

Value *TmpVec = VectorizedValue;		Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {		for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
if (IsPairwiseReduction) {		if (IsPairwiseReduction) {
Value *LeftMask =		Value *LeftMask =
createRdxShuffleMask(ReduxWidth, i, true, true, Builder);		createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
Value *RightMask =		Value *RightMask =
createRdxShuffleMask(ReduxWidth, i, true, false, Builder);		createRdxShuffleMask(ReduxWidth, i, true, false, Builder);

Value *LeftShuf = Builder.CreateShuffleVector(		Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");		TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
Value *RightShuf = Builder.CreateShuffleVector(		Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),		TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");		"rdx.shuf.r");
TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,		TmpVec =
"bin.rdx");		ReductionData.createOp(Builder, LeftShuf, RightShuf, "bin.rdx");
} else {		} else {
Value *UpperHalf =		Value *UpperHalf =
createRdxShuffleMask(ReduxWidth, i, false, false, Builder);		createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
Value *Shuf = Builder.CreateShuffleVector(		Value *Shuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");		TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");		TmpVec = ReductionData.createOp(Builder, TmpVec, Shuf, "bin.rdx");
}		}
}		}

// The result is in the first element of the vector.		// The result is in the first element of the vector.
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));		return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
}		}
};		};
} // end anonymous namespace		} // end anonymous namespace
▲ Show 20 Lines • Show All 154 Lines • ▼ Show 20 Lines
} // namespace		} // namespace

/// \brief Attempt to reduce a horizontal reduction.		/// \brief Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding		/// If it is legal to match a horizontal reduction feeding
/// the phi node P with reduction operators Root in a basic block BB, then check		/// the phi node P with reduction operators Root in a basic block BB, then check
/// if it can be done.		/// if it can be done.
/// \returns true if a horizontal reduction was matched and reduced.		/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.		/// \returns false if a horizontal reduction was not matched.
static bool canBeVectorized(		static bool
PHINode P, Instruction Root, BasicBlock *BB, BoUpSLP &R,		canBeVectorized(PHINode P, Instruction Root, BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI,		TargetTransformInfo *TTI,
const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {		const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)		if (!ShouldVectorizeHor)
return false;		return false;

if (!Root)		if (!Root)
return false;		return false;

if (Root->getParent() != BB)		if (Root->getParent() != BB)
return false;		return false;
SmallVector<WeakVHWithLevel, 8> Stack(1, Root);		SmallVector<WeakVHWithLevel, 8> Stack(1, Root);
SmallSet<Value *, 8> VisitedInstrs;		SmallSet<Value *, 8> VisitedInstrs;
bool Res = false;		bool Res = false;
while (!Stack.empty()) {		while (!Stack.empty()) {
Value *V = Stack.back();		Value *V = Stack.back();
if (!V) {		if (!V) {
Stack.pop_back();		Stack.pop_back();
continue;		continue;
}		}
auto *Inst = dyn_cast<Instruction>(V);		auto *Inst = dyn_cast<Instruction>(V);
if (!Inst \|\| isa<PHINode>(Inst)) {		if (!Inst \|\| isa<PHINode>(Inst)) {
Stack.pop_back();		Stack.pop_back();
continue;		continue;
}		}
if (Stack.back().isInitial()) {		if (Stack.back().isInitial()) {
Stack.back().clearInitial();		Stack.back().clearInitial();
if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {		auto *BI = dyn_cast<BinaryOperator>(Inst);
		auto *SI = dyn_cast<SelectInst>(Inst);
		if (BI \|\| SI) {
HorizontalReduction HorRdx(R.getMinVecRegSize());		HorizontalReduction HorRdx(R.getMinVecRegSize());
if (HorRdx.matchAssociativeReduction(P, BI)) {		if (HorRdx.matchAssociativeReduction(P, Inst)) {
// If there is a sufficient number of reduction values, reduce		// If there is a sufficient number of reduction values, reduce
// to a nearby power-of-2. Can safely generate oversized		// to a nearby power-of-2. Can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.		// vectors and rely on the backend to split them to legal sizes.
HorRdx.ReduxWidth =		HorRdx.ReduxWidth =
std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));		std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));

if (HorRdx.tryToReduce(R, TTI)) {		if (HorRdx.tryToReduce(R, TTI)) {
Res = true;		Res = true;
P = nullptr;		P = nullptr;
continue;		continue;
}		}
}		}
if (P) {		if (P && BI) {
Inst = dyn_cast<Instruction>(BI->getOperand(0));		Inst = dyn_cast<Instruction>(BI->getOperand(0));
if (Inst == P)		if (Inst == P)
Inst = dyn_cast<Instruction>(BI->getOperand(1));		Inst = dyn_cast<Instruction>(BI->getOperand(1));
if (!Inst) {		if (!Inst) {
P = nullptr;		P = nullptr;
continue;		continue;
}		}
}		}
}		}
P = nullptr;		P = nullptr;
if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {		if (Vectorize(Inst, R)) {
Res = true;		Res = true;
continue;		continue;
}		}
}		}
if (Stack.back().isFinal()) {		if (Stack.back().isFinal()) {
Stack.pop_back();		Stack.pop_back();
continue;		continue;
}		}
Show All 14 Lines	bool SLPVectorizerPass::vectorizeRootInstruction(PHINode P, Value V,
auto *I = dyn_cast<Instruction>(V);		auto *I = dyn_cast<Instruction>(V);
if (!I)		if (!I)
return false;		return false;

if (!isa<BinaryOperator>(I))		if (!isa<BinaryOperator>(I))
P = nullptr;		P = nullptr;
// Try to match and vectorize a horizontal reduction.		// Try to match and vectorize a horizontal reduction.
return canBeVectorized(P, I, BB, R, TTI,		return canBeVectorized(P, I, BB, R, TTI,
[this](BinaryOperator *BI, BoUpSLP &R) -> bool {		[this](Instruction *I, BoUpSLP &R) -> bool {
return tryToVectorize(BI, R);		return tryToVectorize(I, R);
});		});
}		}

bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {		bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;		bool Changed = false;
SmallVector<Value *, 4> Incoming;		SmallVector<Value *, 4> Incoming;
SmallSet<Value *, 16> VisitedInstrs;		SmallSet<Value *, 16> VisitedInstrs;

▲ Show 20 Lines • Show All 92 Lines • ▼ Show 20 Lines	if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
it = BB->begin();		it = BB->begin();
e = BB->end();		e = BB->end();
continue;		continue;
}		}
}		}
}		}

// Try to vectorize trees that start at compare instructions.		// Try to vectorize trees that start at compare instructions.
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {		if (auto *BI = dyn_cast<BranchInst>(it)) {
if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {		if (!BI->isConditional())
Changed = true;
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
it = BB->begin();
e = BB->end();
continue;		continue;
}

for (int I = 0; I < 2; ++I) {		if (vectorizeRootInstruction(nullptr, BI->getCondition(), BB, R, TTI)) {
if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
Changed = true;		Changed = true;
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
it = BB->begin();		it = BB->begin();
e = BB->end();		e = BB->end();
break;
}
}
continue;		continue;
}		}
		}

// Try to vectorize trees that start at insertelement instructions.		// Try to vectorize trees that start at insertelement instructions.
if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {		if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
SmallVector<Value *, 16> BuildVector;		SmallVector<Value *, 16> BuildVector;
SmallVector<Value *, 16> BuildVectorOpds;		SmallVector<Value *, 16> BuildVectorOpds;
if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))		if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
continue;		continue;

▲ Show 20 Lines • Show All 157 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/horizontal-list.ll

Show First 20 Lines • Show All 226 Lines • ▼ Show 20 Lines	entry:
%12 = fmul fast float %conv, %11		%12 = fmul fast float %conv, %11
%conv4 = fptosi float %12 to i32		%conv4 = fptosi float %12 to i32
store i32 %conv4, i32* @n, align 4		store i32 %conv4, i32* @n, align 4
ret i32 %conv4		ret i32 %conv4
}		}

define float @bar() {		define float @bar() {
; CHECK-LABEL: @bar(		; CHECK-LABEL: @bar(
; CHECK: [[TMP0:%.]] = load <2 x float>, <2 x float> bitcast ([20 x float]* @arr to <2 x float>*), align 16		; CHECK: [[TMP0:%.]] = load <4 x float>, <4 x float> bitcast ([20 x float]* @arr to <4 x float>*), align 16
; CHECK-NEXT: [[TMP1:%.]] = load <2 x float>, <2 x float> bitcast ([20 x float]* @arr1 to <2 x float>*), align 16		; CHECK-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]		; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0		; CHECK: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1		; CHECK-NEXT: [[TMP7:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]]
; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]		; CHECK-NEXT: [[BIN_RDX:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]]
; CHECK-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]]		; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP5:%.]] = load float, float getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8		; CHECK-NEXT: [[TMP8:%.*]] = fcmp fast ogt <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP6:%.]] = load float, float getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8		; CHECK-NEXT: [[BIN_RDX2:%.*]] = select <4 x i1> [[TMP8]], <4 x float> [[BIN_RDX]], <4 x float> [[RDX_SHUF1]]
; CHECK-NEXT: [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]]		; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
; CHECK-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]]		; CHECK: store float [[TMP9]], float* @res, align 4
; CHECK-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]]		; CHECK-NEXT: ret float [[TMP9]]
; CHECK-NEXT: [[TMP7:%.]] = load float, float getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
; CHECK-NEXT: [[TMP8:%.]] = load float, float getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
; CHECK-NEXT: [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]]
; CHECK-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]]
; CHECK-NEXT: store float [[MAX_0_MUL3_2]], float* @res, align 4
; CHECK-NEXT: ret float [[MAX_0_MUL3_2]]
;		;
entry:		entry:
%0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16		%0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
%1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16		%1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
%mul = fmul fast float %1, %0		%mul = fmul fast float %1, %0
%2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4		%2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
%3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4		%3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
%mul3 = fmul fast float %3, %2		%mul3 = fmul fast float %3, %2
Show All 16 Lines

test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

Show All 28 Lines
; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]		; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]		; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
; CHECK-NEXT: [[TMP21:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4		; CHECK-NEXT: [[TMP21:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]		; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]		; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
; CHECK-NEXT: ret i32 [[TMP23]]		; CHECK-NEXT: ret i32 [[TMP23]]
;		;
; AVX-LABEL: @maxi8(		; AVX-LABEL: @maxi8(
; AVX-NEXT: [[TMP2:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		; AVX-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
; AVX-NEXT: [[TMP3:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		; AVX: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]		; AVX-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]		; AVX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
; AVX-NEXT: [[TMP6:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]		; AVX-NEXT: [[TMP25:%.*]] = icmp sgt <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]		; AVX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x i32> [[BIN_RDX]], <8 x i32> [[RDX_SHUF1]]
; AVX-NEXT: [[TMP9:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4		; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]		; AVX-NEXT: [[TMP26:%.*]] = icmp sgt <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]		; AVX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x i32> [[BIN_RDX2]], <8 x i32> [[RDX_SHUF3]]
; AVX-NEXT: [[TMP12:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16		; AVX-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]		; AVX: ret i32 [[TMP27]]
; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; AVX-NEXT: [[TMP15:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
; AVX-NEXT: [[TMP18:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
; AVX-NEXT: [[TMP21:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
; AVX-NEXT: ret i32 [[TMP23]]
;		;
; AVX2-LABEL: @maxi8(		; AVX2-LABEL: @maxi8(
; AVX2-NEXT: [[TMP2:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		; AVX2-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
; AVX2-NEXT: [[TMP3:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		; AVX2: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]		; AVX2-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]		; AVX2-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
; AVX2-NEXT: [[TMP6:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]		; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]		; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x i32> [[BIN_RDX]], <8 x i32> [[RDX_SHUF1]]
; AVX2-NEXT: [[TMP9:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4		; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]		; AVX2-NEXT: [[TMP26:%.*]] = icmp sgt <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]		; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x i32> [[BIN_RDX2]], <8 x i32> [[RDX_SHUF3]]
; AVX2-NEXT: [[TMP12:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16		; AVX2-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]		; AVX2: ret i32 [[TMP27]]
; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; AVX2-NEXT: [[TMP15:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
; AVX2-NEXT: [[TMP18:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
; AVX2-NEXT: [[TMP21:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
; AVX2-NEXT: ret i32 [[TMP23]]
;		;
; SKX-LABEL: @maxi8(		; SKX-LABEL: @maxi8(
; SKX-NEXT: [[TMP2:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		; SKX-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
; SKX-NEXT: [[TMP3:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		; SKX: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]		; SKX-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]		; SKX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
; SKX-NEXT: [[TMP6:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]		; SKX-NEXT: [[TMP25:%.*]] = icmp sgt <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]		; SKX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x i32> [[BIN_RDX]], <8 x i32> [[RDX_SHUF1]]
; SKX-NEXT: [[TMP9:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4		; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]		; SKX-NEXT: [[TMP26:%.*]] = icmp sgt <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]		; SKX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x i32> [[BIN_RDX2]], <8 x i32> [[RDX_SHUF3]]
; SKX-NEXT: [[TMP12:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16		; SKX-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]		; SKX: ret i32 [[TMP27]]
; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; SKX-NEXT: [[TMP15:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
; SKX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
; SKX-NEXT: [[TMP18:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
; SKX-NEXT: [[TMP21:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
; SKX-NEXT: ret i32 [[TMP23]]
;		;
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
%4 = icmp sgt i32 %2, %3		%4 = icmp sgt i32 %2, %3
%5 = select i1 %4, i32 %2, i32 %3		%5 = select i1 %4, i32 %2, i32 %3
%6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		%6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
%7 = icmp sgt i32 %5, %6		%7 = icmp sgt i32 %5, %6
%8 = select i1 %7, i32 %5, i32 %6		%8 = select i1 %7, i32 %5, i32 %6
▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines
; CHECK-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]		; CHECK-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]		; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
; CHECK-NEXT: [[TMP45:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4		; CHECK-NEXT: [[TMP45:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]		; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]		; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
; CHECK-NEXT: ret i32 [[TMP47]]		; CHECK-NEXT: ret i32 [[TMP47]]
;		;
; AVX-LABEL: @maxi16(		; AVX-LABEL: @maxi16(
; AVX-NEXT: [[TMP2:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		; AVX-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
; AVX-NEXT: [[TMP3:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		; AVX: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]		; AVX-NEXT: [[TMP48:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]		; AVX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
; AVX-NEXT: [[TMP6:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]		; AVX-NEXT: [[TMP49:%.*]] = icmp sgt <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]		; AVX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x i32> [[BIN_RDX]], <16 x i32> [[RDX_SHUF1]]
; AVX-NEXT: [[TMP9:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4		; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]		; AVX-NEXT: [[TMP50:%.*]] = icmp sgt <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]		; AVX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x i32> [[BIN_RDX2]], <16 x i32> [[RDX_SHUF3]]
; AVX-NEXT: [[TMP12:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16		; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]		; AVX-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]		; AVX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x i32> [[BIN_RDX4]], <16 x i32> [[RDX_SHUF5]]
; AVX-NEXT: [[TMP15:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4		; AVX-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]		; AVX: ret i32 [[TMP52]]
; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
; AVX-NEXT: [[TMP18:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
; AVX-NEXT: [[TMP21:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
; AVX-NEXT: [[TMP24:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
; AVX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
; AVX-NEXT: [[TMP27:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
; AVX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
; AVX-NEXT: [[TMP30:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
; AVX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
; AVX-NEXT: [[TMP33:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
; AVX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
; AVX-NEXT: [[TMP36:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
; AVX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
; AVX-NEXT: [[TMP39:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
; AVX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
; AVX-NEXT: [[TMP42:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
; AVX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
; AVX-NEXT: [[TMP45:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
; AVX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
; AVX-NEXT: ret i32 [[TMP47]]
;		;
; AVX2-LABEL: @maxi16(		; AVX2-LABEL: @maxi16(
; AVX2-NEXT: [[TMP2:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		; AVX2-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
; AVX2-NEXT: [[TMP3:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		; AVX2: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]		; AVX2-NEXT: [[TMP48:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]		; AVX2-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
; AVX2-NEXT: [[TMP6:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]		; AVX2-NEXT: [[TMP49:%.*]] = icmp sgt <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]		; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x i32> [[BIN_RDX]], <16 x i32> [[RDX_SHUF1]]
; AVX2-NEXT: [[TMP9:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4		; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]		; AVX2-NEXT: [[TMP50:%.*]] = icmp sgt <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]		; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x i32> [[BIN_RDX2]], <16 x i32> [[RDX_SHUF3]]
; AVX2-NEXT: [[TMP12:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16		; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]		; AVX2-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]		; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x i32> [[BIN_RDX4]], <16 x i32> [[RDX_SHUF5]]
; AVX2-NEXT: [[TMP15:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4		; AVX2-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]		; AVX2: ret i32 [[TMP52]]
; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
; AVX2-NEXT: [[TMP18:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
; AVX2-NEXT: [[TMP21:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
; AVX2-NEXT: [[TMP24:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
; AVX2-NEXT: [[TMP27:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
; AVX2-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
; AVX2-NEXT: [[TMP30:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
; AVX2-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
; AVX2-NEXT: [[TMP33:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
; AVX2-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
; AVX2-NEXT: [[TMP36:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
; AVX2-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
; AVX2-NEXT: [[TMP39:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
; AVX2-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
; AVX2-NEXT: [[TMP42:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
; AVX2-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
; AVX2-NEXT: [[TMP45:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
; AVX2-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
; AVX2-NEXT: ret i32 [[TMP47]]
;		;
; SKX-LABEL: @maxi16(		; SKX-LABEL: @maxi16(
; SKX-NEXT: [[TMP2:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		; SKX-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
; SKX-NEXT: [[TMP3:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		; SKX: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]		; SKX-NEXT: [[TMP48:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]		; SKX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
; SKX-NEXT: [[TMP6:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]		; SKX-NEXT: [[TMP49:%.*]] = icmp sgt <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]		; SKX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x i32> [[BIN_RDX]], <16 x i32> [[RDX_SHUF1]]
; SKX-NEXT: [[TMP9:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4		; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]		; SKX-NEXT: [[TMP50:%.*]] = icmp sgt <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]		; SKX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x i32> [[BIN_RDX2]], <16 x i32> [[RDX_SHUF3]]
; SKX-NEXT: [[TMP12:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16		; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]		; SKX-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]		; SKX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x i32> [[BIN_RDX4]], <16 x i32> [[RDX_SHUF5]]
; SKX-NEXT: [[TMP15:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4		; SKX-NEXT: [[TMP52:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
; SKX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]		; SKX: ret i32 [[TMP52]]
; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
; SKX-NEXT: [[TMP18:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
; SKX-NEXT: [[TMP21:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
; SKX-NEXT: [[TMP24:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
; SKX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
; SKX-NEXT: [[TMP27:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
; SKX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
; SKX-NEXT: [[TMP30:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
; SKX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
; SKX-NEXT: [[TMP33:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
; SKX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
; SKX-NEXT: [[TMP36:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
; SKX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
; SKX-NEXT: [[TMP39:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
; SKX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
; SKX-NEXT: [[TMP42:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
; SKX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
; SKX-NEXT: [[TMP45:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
; SKX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
; SKX-NEXT: ret i32 [[TMP47]]
;		;
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
%4 = icmp sgt i32 %2, %3		%4 = icmp sgt i32 %2, %3
%5 = select i1 %4, i32 %2, i32 %3		%5 = select i1 %4, i32 %2, i32 %3
%6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		%6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
%7 = icmp sgt i32 %5, %6		%7 = icmp sgt i32 %5, %6
%8 = select i1 %7, i32 %5, i32 %6		%8 = select i1 %7, i32 %5, i32 %6
Show All 36 Lines	;
%45 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4		%45 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
%46 = icmp sgt i32 %44, %45		%46 = icmp sgt i32 %44, %45
%47 = select i1 %46, i32 %44, i32 %45		%47 = select i1 %46, i32 %44, i32 %45
ret i32 %47		ret i32 %47
}		}

define i32 @maxi32(i32) {		define i32 @maxi32(i32) {
; CHECK-LABEL: @maxi32(		; CHECK-LABEL: @maxi32(
; CHECK-NEXT: [[TMP2:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		; CHECK-NEXT: [[TMP2:%.]] = load <32 x i32>, <32 x i32> bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
; CHECK-NEXT: [[TMP3:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		; CHECK: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]		; CHECK-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]		; CHECK-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
; CHECK-NEXT: [[TMP6:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]		; CHECK-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]		; CHECK-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP9:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4		; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]		; CHECK-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]		; CHECK-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP12:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16		; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]		; CHECK-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]		; CHECK-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]]
; CHECK-NEXT: [[TMP15:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4		; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]		; CHECK-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]		; CHECK-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]]
; CHECK-NEXT: [[TMP18:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8		; CHECK-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]		; CHECK: ret i32 [[TMP101]]
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
; CHECK-NEXT: [[TMP21:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
; CHECK-NEXT: [[TMP24:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
; CHECK-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
; CHECK-NEXT: [[TMP27:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
; CHECK-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
; CHECK-NEXT: [[TMP30:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
; CHECK-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
; CHECK-NEXT: [[TMP33:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
; CHECK-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
; CHECK-NEXT: [[TMP36:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
; CHECK-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
; CHECK-NEXT: [[TMP39:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
; CHECK-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
; CHECK-NEXT: [[TMP42:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
; CHECK-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
; CHECK-NEXT: [[TMP45:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
; CHECK-NEXT: [[TMP48:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
; CHECK-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]]
; CHECK-NEXT: [[TMP51:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
; CHECK-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]]
; CHECK-NEXT: [[TMP54:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
; CHECK-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]]
; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]]
; CHECK-NEXT: [[TMP57:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
; CHECK-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]]
; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]]
; CHECK-NEXT: [[TMP60:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
; CHECK-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]]
; CHECK-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]]
; CHECK-NEXT: [[TMP63:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]]
; CHECK-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]]
; CHECK-NEXT: [[TMP66:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
; CHECK-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]]
; CHECK-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]]
; CHECK-NEXT: [[TMP69:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
; CHECK-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]]
; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]]
; CHECK-NEXT: [[TMP72:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
; CHECK-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]]
; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]]
; CHECK-NEXT: [[TMP75:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
; CHECK-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]]
; CHECK-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]]
; CHECK-NEXT: [[TMP78:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
; CHECK-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]]
; CHECK-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]]
; CHECK-NEXT: [[TMP81:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
; CHECK-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]]
; CHECK-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]]
; CHECK-NEXT: [[TMP84:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
; CHECK-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]]
; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]]
; CHECK-NEXT: [[TMP87:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
; CHECK-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]]
; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]]
; CHECK-NEXT: [[TMP90:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
; CHECK-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]]
; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]]
; CHECK-NEXT: [[TMP93:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
; CHECK-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]]
; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]]
; CHECK-NEXT: ret i32 [[TMP95]]
;		;
; AVX-LABEL: @maxi32(		; AVX-LABEL: @maxi32(
; AVX-NEXT: [[TMP2:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		; AVX-NEXT: [[TMP2:%.]] = load <32 x i32>, <32 x i32> bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
; AVX-NEXT: [[TMP3:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		; AVX: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]		; AVX-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]		; AVX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
; AVX-NEXT: [[TMP6:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]		; AVX-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]		; AVX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]]
; AVX-NEXT: [[TMP9:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4		; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]		; AVX-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]		; AVX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]]
; AVX-NEXT: [[TMP12:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16		; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]		; AVX-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]		; AVX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]]
; AVX-NEXT: [[TMP15:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4		; AVX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]		; AVX-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]		; AVX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]]
; AVX-NEXT: [[TMP18:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8		; AVX-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]		; AVX: ret i32 [[TMP101]]
; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
; AVX-NEXT: [[TMP21:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; AVX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
; AVX-NEXT: [[TMP24:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
; AVX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
; AVX-NEXT: [[TMP27:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
; AVX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
; AVX-NEXT: [[TMP30:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
; AVX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
; AVX-NEXT: [[TMP33:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
; AVX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
; AVX-NEXT: [[TMP36:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
; AVX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
; AVX-NEXT: [[TMP39:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
; AVX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
; AVX-NEXT: [[TMP42:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
; AVX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
; AVX-NEXT: [[TMP45:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
; AVX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
; AVX-NEXT: [[TMP48:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
; AVX-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]]
; AVX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]]
; AVX-NEXT: [[TMP51:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
; AVX-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
; AVX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]]
; AVX-NEXT: [[TMP54:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
; AVX-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]]
; AVX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]]
; AVX-NEXT: [[TMP57:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
; AVX-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]]
; AVX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]]
; AVX-NEXT: [[TMP60:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
; AVX-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]]
; AVX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]]
; AVX-NEXT: [[TMP63:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
; AVX-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]]
; AVX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]]
; AVX-NEXT: [[TMP66:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
; AVX-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]]
; AVX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]]
; AVX-NEXT: [[TMP69:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
; AVX-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]]
; AVX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]]
; AVX-NEXT: [[TMP72:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
; AVX-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]]
; AVX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]]
; AVX-NEXT: [[TMP75:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
; AVX-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]]
; AVX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]]
; AVX-NEXT: [[TMP78:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
; AVX-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]]
; AVX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]]
; AVX-NEXT: [[TMP81:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
; AVX-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]]
; AVX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]]
; AVX-NEXT: [[TMP84:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
; AVX-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]]
; AVX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]]
; AVX-NEXT: [[TMP87:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
; AVX-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]]
; AVX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]]
; AVX-NEXT: [[TMP90:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
; AVX-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]]
; AVX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]]
; AVX-NEXT: [[TMP93:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
; AVX-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]]
; AVX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]]
; AVX-NEXT: ret i32 [[TMP95]]
;		;
; AVX2-LABEL: @maxi32(		; AVX2-LABEL: @maxi32(
; AVX2-NEXT: [[TMP2:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		; AVX2-NEXT: [[TMP2:%.]] = load <32 x i32>, <32 x i32> bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
; AVX2-NEXT: [[TMP3:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		; AVX2: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]		; AVX2-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]		; AVX2-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
; AVX2-NEXT: [[TMP6:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]		; AVX2-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]		; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]]
; AVX2-NEXT: [[TMP9:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4		; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]		; AVX2-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]		; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]]
; AVX2-NEXT: [[TMP12:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16		; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]		; AVX2-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]		; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]]
; AVX2-NEXT: [[TMP15:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4		; AVX2-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]		; AVX2-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]		; AVX2-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]]
; AVX2-NEXT: [[TMP18:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8		; AVX2-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]		; AVX2: ret i32 [[TMP101]]
; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
; AVX2-NEXT: [[TMP21:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
; AVX2-NEXT: [[TMP24:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
; AVX2-NEXT: [[TMP27:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
; AVX2-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
; AVX2-NEXT: [[TMP30:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
; AVX2-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
; AVX2-NEXT: [[TMP33:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
; AVX2-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
; AVX2-NEXT: [[TMP36:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
; AVX2-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
; AVX2-NEXT: [[TMP39:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
; AVX2-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
; AVX2-NEXT: [[TMP42:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
; AVX2-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
; AVX2-NEXT: [[TMP45:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
; AVX2-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
; AVX2-NEXT: [[TMP48:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
; AVX2-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]]
; AVX2-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]]
; AVX2-NEXT: [[TMP51:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
; AVX2-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
; AVX2-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]]
; AVX2-NEXT: [[TMP54:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
; AVX2-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]]
; AVX2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]]
; AVX2-NEXT: [[TMP57:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
; AVX2-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]]
; AVX2-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]]
; AVX2-NEXT: [[TMP60:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
; AVX2-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]]
; AVX2-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]]
; AVX2-NEXT: [[TMP63:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
; AVX2-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]]
; AVX2-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]]
; AVX2-NEXT: [[TMP66:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
; AVX2-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]]
; AVX2-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]]
; AVX2-NEXT: [[TMP69:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
; AVX2-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]]
; AVX2-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]]
; AVX2-NEXT: [[TMP72:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
; AVX2-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]]
; AVX2-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]]
; AVX2-NEXT: [[TMP75:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
; AVX2-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]]
; AVX2-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]]
; AVX2-NEXT: [[TMP78:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
; AVX2-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]]
; AVX2-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]]
; AVX2-NEXT: [[TMP81:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
; AVX2-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]]
; AVX2-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]]
; AVX2-NEXT: [[TMP84:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
; AVX2-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]]
; AVX2-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]]
; AVX2-NEXT: [[TMP87:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
; AVX2-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]]
; AVX2-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]]
; AVX2-NEXT: [[TMP90:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
; AVX2-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]]
; AVX2-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]]
; AVX2-NEXT: [[TMP93:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
; AVX2-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]]
; AVX2-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]]
; AVX2-NEXT: ret i32 [[TMP95]]
;		;
; SKX-LABEL: @maxi32(		; SKX-LABEL: @maxi32(
; SKX-NEXT: [[TMP2:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		; SKX-NEXT: [[TMP2:%.]] = load <32 x i32>, <32 x i32> bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
; SKX-NEXT: [[TMP3:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		; SKX: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]		; SKX-NEXT: [[TMP96:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]		; SKX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
; SKX-NEXT: [[TMP6:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]		; SKX-NEXT: [[TMP97:%.*]] = icmp sgt <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]		; SKX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x i32> [[BIN_RDX]], <32 x i32> [[RDX_SHUF1]]
; SKX-NEXT: [[TMP9:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4		; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]		; SKX-NEXT: [[TMP98:%.*]] = icmp sgt <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]		; SKX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x i32> [[BIN_RDX2]], <32 x i32> [[RDX_SHUF3]]
; SKX-NEXT: [[TMP12:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16		; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]		; SKX-NEXT: [[TMP99:%.*]] = icmp sgt <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]		; SKX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x i32> [[BIN_RDX4]], <32 x i32> [[RDX_SHUF5]]
; SKX-NEXT: [[TMP15:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4		; SKX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]		; SKX-NEXT: [[TMP100:%.*]] = icmp sgt <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]		; SKX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x i32> [[BIN_RDX6]], <32 x i32> [[RDX_SHUF7]]
; SKX-NEXT: [[TMP18:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8		; SKX-NEXT: [[TMP101:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]		; SKX: ret i32 [[TMP101]]
; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
; SKX-NEXT: [[TMP21:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
; SKX-NEXT: [[TMP24:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
; SKX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
; SKX-NEXT: [[TMP27:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
; SKX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
; SKX-NEXT: [[TMP30:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
; SKX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
; SKX-NEXT: [[TMP33:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
; SKX-NEXT: [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
; SKX-NEXT: [[TMP36:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
; SKX-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
; SKX-NEXT: [[TMP39:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
; SKX-NEXT: [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
; SKX-NEXT: [[TMP42:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
; SKX-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
; SKX-NEXT: [[TMP45:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
; SKX-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
; SKX-NEXT: [[TMP48:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
; SKX-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]]
; SKX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]]
; SKX-NEXT: [[TMP51:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
; SKX-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
; SKX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]]
; SKX-NEXT: [[TMP54:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
; SKX-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]]
; SKX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]]
; SKX-NEXT: [[TMP57:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
; SKX-NEXT: [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]]
; SKX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]]
; SKX-NEXT: [[TMP60:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
; SKX-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]]
; SKX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]]
; SKX-NEXT: [[TMP63:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
; SKX-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]]
; SKX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]]
; SKX-NEXT: [[TMP66:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
; SKX-NEXT: [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]]
; SKX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]]
; SKX-NEXT: [[TMP69:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
; SKX-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]]
; SKX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]]
; SKX-NEXT: [[TMP72:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
; SKX-NEXT: [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]]
; SKX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]]
; SKX-NEXT: [[TMP75:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
; SKX-NEXT: [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]]
; SKX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]]
; SKX-NEXT: [[TMP78:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
; SKX-NEXT: [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]]
; SKX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]]
; SKX-NEXT: [[TMP81:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
; SKX-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]]
; SKX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]]
; SKX-NEXT: [[TMP84:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
; SKX-NEXT: [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]]
; SKX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]]
; SKX-NEXT: [[TMP87:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
; SKX-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]]
; SKX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]]
; SKX-NEXT: [[TMP90:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
; SKX-NEXT: [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]]
; SKX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]]
; SKX-NEXT: [[TMP93:%.]] = load i32, i32 getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
; SKX-NEXT: [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]]
; SKX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]]
; SKX-NEXT: ret i32 [[TMP95]]
;		;
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16		%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4		%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
%4 = icmp sgt i32 %2, %3		%4 = icmp sgt i32 %2, %3
%5 = select i1 %4, i32 %2, i32 %3		%5 = select i1 %4, i32 %2, i32 %3
%6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8		%6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
%7 = icmp sgt i32 %5, %6		%7 = icmp sgt i32 %5, %6
%8 = select i1 %7, i32 %5, i32 %6		%8 = select i1 %7, i32 %5, i32 %6
▲ Show 20 Lines • Show All 109 Lines • ▼ Show 20 Lines
; CHECK-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]		; CHECK-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]		; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
; CHECK-NEXT: [[TMP21:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4		; CHECK-NEXT: [[TMP21:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
; CHECK-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]		; CHECK-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]		; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
; CHECK-NEXT: ret float [[TMP23]]		; CHECK-NEXT: ret float [[TMP23]]
;		;
; AVX-LABEL: @maxf8(		; AVX-LABEL: @maxf8(
; AVX-NEXT: [[TMP2:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		; AVX-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
; AVX-NEXT: [[TMP3:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		; AVX: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]		; AVX-NEXT: [[TMP24:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]		; AVX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
; AVX-NEXT: [[TMP6:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]		; AVX-NEXT: [[TMP25:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]		; AVX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x float> [[BIN_RDX]], <8 x float> [[RDX_SHUF1]]
; AVX-NEXT: [[TMP9:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4		; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]		; AVX-NEXT: [[TMP26:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]		; AVX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x float> [[BIN_RDX2]], <8 x float> [[RDX_SHUF3]]
; AVX-NEXT: [[TMP12:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16		; AVX-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; AVX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]		; AVX: ret float [[TMP27]]
; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
; AVX-NEXT: [[TMP15:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
; AVX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
; AVX-NEXT: [[TMP18:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
; AVX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
; AVX-NEXT: [[TMP21:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
; AVX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
; AVX-NEXT: ret float [[TMP23]]
;		;
; AVX2-LABEL: @maxf8(		; AVX2-LABEL: @maxf8(
; AVX2-NEXT: [[TMP2:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		; AVX2-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
; AVX2-NEXT: [[TMP3:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		; AVX2: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]		; AVX2-NEXT: [[TMP24:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]		; AVX2-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
; AVX2-NEXT: [[TMP6:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]		; AVX2-NEXT: [[TMP25:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]		; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x float> [[BIN_RDX]], <8 x float> [[RDX_SHUF1]]
; AVX2-NEXT: [[TMP9:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4		; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]		; AVX2-NEXT: [[TMP26:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]		; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x float> [[BIN_RDX2]], <8 x float> [[RDX_SHUF3]]
; AVX2-NEXT: [[TMP12:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16		; AVX2-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; AVX2-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]		; AVX2: ret float [[TMP27]]
; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
; AVX2-NEXT: [[TMP15:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
; AVX2-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
; AVX2-NEXT: [[TMP18:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
; AVX2-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
; AVX2-NEXT: [[TMP21:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
; AVX2-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
; AVX2-NEXT: ret float [[TMP23]]
;		;
; SKX-LABEL: @maxf8(		; SKX-LABEL: @maxf8(
; SKX-NEXT: [[TMP2:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		; SKX-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
; SKX-NEXT: [[TMP3:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		; SKX: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]		; SKX-NEXT: [[TMP24:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]		; SKX-NEXT: [[BIN_RDX:%.*]] = select <8 x i1> [[TMP24]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
; SKX-NEXT: [[TMP6:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]		; SKX-NEXT: [[TMP25:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]		; SKX-NEXT: [[BIN_RDX2:%.*]] = select <8 x i1> [[TMP25]], <8 x float> [[BIN_RDX]], <8 x float> [[RDX_SHUF1]]
; SKX-NEXT: [[TMP9:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4		; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]		; SKX-NEXT: [[TMP26:%.*]] = fcmp fast ogt <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]		; SKX-NEXT: [[BIN_RDX4:%.*]] = select <8 x i1> [[TMP26]], <8 x float> [[BIN_RDX2]], <8 x float> [[RDX_SHUF3]]
; SKX-NEXT: [[TMP12:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16		; SKX-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; SKX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]		; SKX: ret float [[TMP27]]
; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
; SKX-NEXT: [[TMP15:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
; SKX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
; SKX-NEXT: [[TMP18:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
; SKX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
; SKX-NEXT: [[TMP21:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
; SKX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
; SKX-NEXT: ret float [[TMP23]]
;		;
%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
%4 = fcmp fast ogt float %2, %3		%4 = fcmp fast ogt float %2, %3
%5 = select i1 %4, float %2, float %3		%5 = select i1 %4, float %2, float %3
%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
%7 = fcmp fast ogt float %5, %6		%7 = fcmp fast ogt float %5, %6
%8 = select i1 %7, float %5, float %6		%8 = select i1 %7, float %5, float %6
▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines
; CHECK-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]		; CHECK-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]		; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
; CHECK-NEXT: [[TMP45:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4		; CHECK-NEXT: [[TMP45:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
; CHECK-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]		; CHECK-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]		; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
; CHECK-NEXT: ret float [[TMP47]]		; CHECK-NEXT: ret float [[TMP47]]
;		;
; AVX-LABEL: @maxf16(		; AVX-LABEL: @maxf16(
; AVX-NEXT: [[TMP2:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		; AVX-NEXT: [[TMP2:%.]] = load <16 x float>, <16 x float> bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
; AVX-NEXT: [[TMP3:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		; AVX: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]		; AVX-NEXT: [[TMP48:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]]
; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]		; AVX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]]
; AVX-NEXT: [[TMP6:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]		; AVX-NEXT: [[TMP49:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]		; AVX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x float> [[BIN_RDX]], <16 x float> [[RDX_SHUF1]]
; AVX-NEXT: [[TMP9:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4		; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]		; AVX-NEXT: [[TMP50:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]		; AVX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x float> [[BIN_RDX2]], <16 x float> [[RDX_SHUF3]]
; AVX-NEXT: [[TMP12:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16		; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]		; AVX-NEXT: [[TMP51:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]		; AVX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x float> [[BIN_RDX4]], <16 x float> [[RDX_SHUF5]]
; AVX-NEXT: [[TMP15:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4		; AVX-NEXT: [[TMP52:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0
; AVX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]		; AVX: ret float [[TMP52]]
; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
; AVX-NEXT: [[TMP18:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
; AVX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
; AVX-NEXT: [[TMP21:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
; AVX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
; AVX-NEXT: [[TMP24:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
; AVX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
; AVX-NEXT: [[TMP27:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
; AVX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
; AVX-NEXT: [[TMP30:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
; AVX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
; AVX-NEXT: [[TMP33:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
; AVX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
; AVX-NEXT: [[TMP36:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
; AVX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
; AVX-NEXT: [[TMP39:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
; AVX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
; AVX-NEXT: [[TMP42:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
; AVX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
; AVX-NEXT: [[TMP45:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
; AVX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
; AVX-NEXT: ret float [[TMP47]]
;		;
; AVX2-LABEL: @maxf16(		; AVX2-LABEL: @maxf16(
; AVX2-NEXT: [[TMP2:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		; AVX2-NEXT: [[TMP2:%.]] = load <16 x float>, <16 x float> bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
; AVX2-NEXT: [[TMP3:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		; AVX2: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]		; AVX2-NEXT: [[TMP48:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]]
; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]		; AVX2-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]]
; AVX2-NEXT: [[TMP6:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]		; AVX2-NEXT: [[TMP49:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]		; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x float> [[BIN_RDX]], <16 x float> [[RDX_SHUF1]]
; AVX2-NEXT: [[TMP9:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4		; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]		; AVX2-NEXT: [[TMP50:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]		; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x float> [[BIN_RDX2]], <16 x float> [[RDX_SHUF3]]
; AVX2-NEXT: [[TMP12:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16		; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]		; AVX2-NEXT: [[TMP51:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]		; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x float> [[BIN_RDX4]], <16 x float> [[RDX_SHUF5]]
; AVX2-NEXT: [[TMP15:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4		; AVX2-NEXT: [[TMP52:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0
; AVX2-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]		; AVX2: ret float [[TMP52]]
; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
; AVX2-NEXT: [[TMP18:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
; AVX2-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
; AVX2-NEXT: [[TMP21:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
; AVX2-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
; AVX2-NEXT: [[TMP24:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
; AVX2-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
; AVX2-NEXT: [[TMP27:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
; AVX2-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
; AVX2-NEXT: [[TMP30:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
; AVX2-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
; AVX2-NEXT: [[TMP33:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
; AVX2-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
; AVX2-NEXT: [[TMP36:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
; AVX2-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
; AVX2-NEXT: [[TMP39:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
; AVX2-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
; AVX2-NEXT: [[TMP42:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
; AVX2-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
; AVX2-NEXT: [[TMP45:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
; AVX2-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
; AVX2-NEXT: ret float [[TMP47]]
;		;
; SKX-LABEL: @maxf16(		; SKX-LABEL: @maxf16(
; SKX-NEXT: [[TMP2:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		; SKX-NEXT: [[TMP2:%.]] = load <16 x float>, <16 x float> bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
; SKX-NEXT: [[TMP3:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		; SKX: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]		; SKX-NEXT: [[TMP48:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]]
; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]		; SKX-NEXT: [[BIN_RDX:%.*]] = select <16 x i1> [[TMP48]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]]
; SKX-NEXT: [[TMP6:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]		; SKX-NEXT: [[TMP49:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]		; SKX-NEXT: [[BIN_RDX2:%.*]] = select <16 x i1> [[TMP49]], <16 x float> [[BIN_RDX]], <16 x float> [[RDX_SHUF1]]
; SKX-NEXT: [[TMP9:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4		; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]		; SKX-NEXT: [[TMP50:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]		; SKX-NEXT: [[BIN_RDX4:%.*]] = select <16 x i1> [[TMP50]], <16 x float> [[BIN_RDX2]], <16 x float> [[RDX_SHUF3]]
; SKX-NEXT: [[TMP12:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16		; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]		; SKX-NEXT: [[TMP51:%.*]] = fcmp fast ogt <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]		; SKX-NEXT: [[BIN_RDX6:%.*]] = select <16 x i1> [[TMP51]], <16 x float> [[BIN_RDX4]], <16 x float> [[RDX_SHUF5]]
; SKX-NEXT: [[TMP15:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4		; SKX-NEXT: [[TMP52:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0
; SKX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]		; SKX: ret float [[TMP52]]
; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
; SKX-NEXT: [[TMP18:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
; SKX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
; SKX-NEXT: [[TMP21:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
; SKX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
; SKX-NEXT: [[TMP24:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
; SKX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
; SKX-NEXT: [[TMP27:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
; SKX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
; SKX-NEXT: [[TMP30:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
; SKX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
; SKX-NEXT: [[TMP33:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
; SKX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
; SKX-NEXT: [[TMP36:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
; SKX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
; SKX-NEXT: [[TMP39:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
; SKX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
; SKX-NEXT: [[TMP42:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
; SKX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
; SKX-NEXT: [[TMP45:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
; SKX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
; SKX-NEXT: ret float [[TMP47]]
;		;
%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
%4 = fcmp fast ogt float %2, %3		%4 = fcmp fast ogt float %2, %3
%5 = select i1 %4, float %2, float %3		%5 = select i1 %4, float %2, float %3
%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
%7 = fcmp fast ogt float %5, %6		%7 = fcmp fast ogt float %5, %6
%8 = select i1 %7, float %5, float %6		%8 = select i1 %7, float %5, float %6
▲ Show 20 Lines • Show All 133 Lines • ▼ Show 20 Lines
; CHECK-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]]		; CHECK-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]]
; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]]		; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]]
; CHECK-NEXT: [[TMP93:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4		; CHECK-NEXT: [[TMP93:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
; CHECK-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]]		; CHECK-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]]
; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]]		; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]]
; CHECK-NEXT: ret float [[TMP95]]		; CHECK-NEXT: ret float [[TMP95]]
;		;
; AVX-LABEL: @maxf32(		; AVX-LABEL: @maxf32(
; AVX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		; AVX-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
; AVX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		; AVX: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]		; AVX-NEXT: [[TMP96:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]]
; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]		; AVX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]]
; AVX-NEXT: [[TMP6:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]		; AVX-NEXT: [[TMP97:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]		; AVX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x float> [[BIN_RDX]], <32 x float> [[RDX_SHUF1]]
; AVX-NEXT: [[TMP9:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4		; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]		; AVX-NEXT: [[TMP98:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]		; AVX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x float> [[BIN_RDX2]], <32 x float> [[RDX_SHUF3]]
; AVX-NEXT: [[TMP12:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16		; AVX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]		; AVX-NEXT: [[TMP99:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]		; AVX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x float> [[BIN_RDX4]], <32 x float> [[RDX_SHUF5]]
; AVX-NEXT: [[TMP15:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4		; AVX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]		; AVX-NEXT: [[TMP100:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]		; AVX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x float> [[BIN_RDX6]], <32 x float> [[RDX_SHUF7]]
; AVX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8		; AVX-NEXT: [[TMP101:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
; AVX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]		; AVX: ret float [[TMP101]]
; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
; AVX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
; AVX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
; AVX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
; AVX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
; AVX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
; AVX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
; AVX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
; AVX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
; AVX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
; AVX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
; AVX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
; AVX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
; AVX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
; AVX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
; AVX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
; AVX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
; AVX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
; AVX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
; AVX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
; AVX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
; AVX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
; AVX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
; AVX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
; AVX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
; AVX-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16
; AVX-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]]
; AVX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]]
; AVX-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4
; AVX-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]]
; AVX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]]
; AVX-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8
; AVX-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]]
; AVX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]]
; AVX-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4
; AVX-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]]
; AVX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]]
; AVX-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16
; AVX-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]]
; AVX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]]
; AVX-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4
; AVX-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]]
; AVX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]]
; AVX-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8
; AVX-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]]
; AVX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]]
; AVX-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4
; AVX-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]]
; AVX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]]
; AVX-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16
; AVX-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]]
; AVX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]]
; AVX-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4
; AVX-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]]
; AVX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]]
; AVX-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8
; AVX-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]]
; AVX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]]
; AVX-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4
; AVX-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]]
; AVX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]]
; AVX-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
; AVX-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]]
; AVX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]]
; AVX-NEXT: [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
; AVX-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]]
; AVX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]]
; AVX-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
; AVX-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]]
; AVX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]]
; AVX-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
; AVX-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]]
; AVX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]]
; AVX-NEXT: ret float [[TMP95]]
;		;
; AVX2-LABEL: @maxf32(		; AVX2-LABEL: @maxf32(
; AVX2-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		; AVX2-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
; AVX2-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		; AVX2: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]		; AVX2-NEXT: [[TMP96:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]]
; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]		; AVX2-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]]
; AVX2-NEXT: [[TMP6:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]		; AVX2-NEXT: [[TMP97:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]		; AVX2-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x float> [[BIN_RDX]], <32 x float> [[RDX_SHUF1]]
; AVX2-NEXT: [[TMP9:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4		; AVX2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]		; AVX2-NEXT: [[TMP98:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]		; AVX2-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x float> [[BIN_RDX2]], <32 x float> [[RDX_SHUF3]]
; AVX2-NEXT: [[TMP12:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16		; AVX2-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]		; AVX2-NEXT: [[TMP99:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]		; AVX2-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x float> [[BIN_RDX4]], <32 x float> [[RDX_SHUF5]]
; AVX2-NEXT: [[TMP15:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4		; AVX2-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]		; AVX2-NEXT: [[TMP100:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]		; AVX2-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x float> [[BIN_RDX6]], <32 x float> [[RDX_SHUF7]]
; AVX2-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8		; AVX2-NEXT: [[TMP101:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
; AVX2-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]		; AVX2: ret float [[TMP101]]
; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
; AVX2-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
; AVX2-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
; AVX2-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
; AVX2-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
; AVX2-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
; AVX2-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
; AVX2-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
; AVX2-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
; AVX2-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
; AVX2-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
; AVX2-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
; AVX2-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
; AVX2-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
; AVX2-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
; AVX2-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
; AVX2-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
; AVX2-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
; AVX2-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
; AVX2-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
; AVX2-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
; AVX2-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
; AVX2-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
; AVX2-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
; AVX2-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
; AVX2-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
; AVX2-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16
; AVX2-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]]
; AVX2-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]]
; AVX2-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4
; AVX2-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]]
; AVX2-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]]
; AVX2-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8
; AVX2-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]]
; AVX2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]]
; AVX2-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4
; AVX2-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]]
; AVX2-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]]
; AVX2-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16
; AVX2-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]]
; AVX2-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]]
; AVX2-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4
; AVX2-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]]
; AVX2-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]]
; AVX2-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8
; AVX2-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]]
; AVX2-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]]
; AVX2-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4
; AVX2-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]]
; AVX2-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]]
; AVX2-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16
; AVX2-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]]
; AVX2-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]]
; AVX2-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4
; AVX2-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]]
; AVX2-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]]
; AVX2-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8
; AVX2-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]]
; AVX2-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]]
; AVX2-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4
; AVX2-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]]
; AVX2-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]]
; AVX2-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
; AVX2-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]]
; AVX2-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]]
; AVX2-NEXT: [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
; AVX2-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]]
; AVX2-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]]
; AVX2-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
; AVX2-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]]
; AVX2-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]]
; AVX2-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
; AVX2-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]]
; AVX2-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]]
; AVX2-NEXT: ret float [[TMP95]]
;		;
; SKX-LABEL: @maxf32(		; SKX-LABEL: @maxf32(
; SKX-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		; SKX-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
; SKX-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		; SKX: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]		; SKX-NEXT: [[TMP96:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]]
; SKX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]		; SKX-NEXT: [[BIN_RDX:%.*]] = select <32 x i1> [[TMP96]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]]
; SKX-NEXT: [[TMP6:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]		; SKX-NEXT: [[TMP97:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; SKX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]		; SKX-NEXT: [[BIN_RDX2:%.*]] = select <32 x i1> [[TMP97]], <32 x float> [[BIN_RDX]], <32 x float> [[RDX_SHUF1]]
; SKX-NEXT: [[TMP9:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4		; SKX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]		; SKX-NEXT: [[TMP98:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]		; SKX-NEXT: [[BIN_RDX4:%.*]] = select <32 x i1> [[TMP98]], <32 x float> [[BIN_RDX2]], <32 x float> [[RDX_SHUF3]]
; SKX-NEXT: [[TMP12:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16		; SKX-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]		; SKX-NEXT: [[TMP99:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
; SKX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]		; SKX-NEXT: [[BIN_RDX6:%.*]] = select <32 x i1> [[TMP99]], <32 x float> [[BIN_RDX4]], <32 x float> [[RDX_SHUF5]]
; SKX-NEXT: [[TMP15:%.]] = load float, float getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4		; SKX-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SKX-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]		; SKX-NEXT: [[TMP100:%.*]] = fcmp fast ogt <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
; SKX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]		; SKX-NEXT: [[BIN_RDX8:%.*]] = select <32 x i1> [[TMP100]], <32 x float> [[BIN_RDX6]], <32 x float> [[RDX_SHUF7]]
; SKX-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8		; SKX-NEXT: [[TMP101:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
; SKX-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]		; SKX: ret float [[TMP101]]
; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
; SKX-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
; SKX-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
; SKX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
; SKX-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
; SKX-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
; SKX-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
; SKX-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
; SKX-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
; SKX-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
; SKX-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
; SKX-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
; SKX-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
; SKX-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
; SKX-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
; SKX-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
; SKX-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
; SKX-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
; SKX-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
; SKX-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
; SKX-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
; SKX-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
; SKX-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
; SKX-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
; SKX-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
; SKX-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
; SKX-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16
; SKX-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]]
; SKX-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]]
; SKX-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4
; SKX-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]]
; SKX-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]]
; SKX-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8
; SKX-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]]
; SKX-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]]
; SKX-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4
; SKX-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]]
; SKX-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]]
; SKX-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16
; SKX-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]]
; SKX-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]]
; SKX-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4
; SKX-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]]
; SKX-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]]
; SKX-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8
; SKX-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]]
; SKX-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]]
; SKX-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4
; SKX-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]]
; SKX-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]]
; SKX-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16
; SKX-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]]
; SKX-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]]
; SKX-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4
; SKX-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]]
; SKX-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]]
; SKX-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8
; SKX-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]]
; SKX-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]]
; SKX-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4
; SKX-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]]
; SKX-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]]
; SKX-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
; SKX-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]]
; SKX-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]]
; SKX-NEXT: [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
; SKX-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]]
; SKX-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]]
; SKX-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
; SKX-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]]
; SKX-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]]
; SKX-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
; SKX-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]]
; SKX-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]]
; SKX-NEXT: ret float [[TMP95]]
;		;
%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
%4 = fcmp fast ogt float %2, %3		%4 = fcmp fast ogt float %2, %3
%5 = select i1 %4, float %2, float %3		%5 = select i1 %4, float %2, float %3
%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
%7 = fcmp fast ogt float %5, %6		%7 = fcmp fast ogt float %5, %6
%8 = select i1 %7, float %5, float %6		%8 = select i1 %7, float %5, float %6
▲ Show 20 Lines • Show All 90 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[SLP] Support for horizontal min/max reduction
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 81744

include/llvm/Analysis/TargetTransformInfo.h

include/llvm/Analysis/TargetTransformInfoImpl.h

include/llvm/CodeGen/BasicTTIImpl.h

include/llvm/Transforms/Vectorize/SLPVectorizer.h

lib/Analysis/CostModel.cpp

lib/Analysis/TargetTransformInfo.cpp

lib/Target/X86/X86TargetTransformInfo.h

lib/Target/X86/X86TargetTransformInfo.cpp

lib/Transforms/Vectorize/SLPVectorizer.cpp

test/Transforms/SLPVectorizer/X86/horizontal-list.ll

test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SLP] Support for horizontal min/max reduction
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 81744

include/llvm/Analysis/TargetTransformInfo.h

include/llvm/Analysis/TargetTransformInfoImpl.h

include/llvm/CodeGen/BasicTTIImpl.h

include/llvm/Transforms/Vectorize/SLPVectorizer.h

lib/Analysis/CostModel.cpp

lib/Analysis/TargetTransformInfo.cpp

lib/Target/X86/X86TargetTransformInfo.h

lib/Target/X86/X86TargetTransformInfo.cpp

lib/Transforms/Vectorize/SLPVectorizer.cpp

test/Transforms/SLPVectorizer/X86/horizontal-list.ll

test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

[SLP] Support for horizontal min/max reduction
ClosedPublic