This is an archive of the discontinued LLVM Phabricator instance.

[SLPVectorizer][X86][AMDGPU] Remove fcmp+select to fmin/fmax reduction support.
ClosedPublic

Authored by craig.topper on Sep 9 2020, 2:13 PM.

Download Raw Diff

Details

Reviewers

RKSimon
spatel
nikic
arsenm
efriedma
vdmitrie
ABataev

Commits

rGc195ae2f0032: [SLPVectorizer][X86][AMDGPU] Remove fcmp+select to fmin/fmax reduction support.

Summary

Previously we could match fcmp+select to a reduction if the fcmp had
the no-NaNs (nnan) fast math flag. But if the select had the nnan fast
math flag, InstCombine would turn it into a fminnum/fmaxnum intrinsic
before SLP gets to it. It seems fairly likely that if one of the
fcmp+select pair has the fast math flag, they both would.

My plan is to start vectorizing the fmaxnum/fminnum version soon,
but I wanted to remove this code first, as it had some of the strangest
fast math flag behaviors.

Diff Detail

Event Timeline

craig.topper created this revision.Sep 9 2020, 2:13 PM

Herald added a project: Restricted Project. · View Herald TranscriptSep 9 2020, 2:13 PM

Herald added subscribers: kerbowa, hiraditya, t-tye and 6 others. · View Herald Transcript

craig.topper requested review of this revision.Sep 9 2020, 2:13 PM

Herald added a subscriber: wdng. · View Herald TranscriptSep 9 2020, 2:13 PM

craig.topper added a reviewer: ABataev.Sep 9 2020, 2:13 PM

LGTM

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
6442	Merge these 4 cases?

This revision is now accepted and ready to land.Sep 10 2020, 6:25 AM

This was committed in c195ae2f003261f2c25f569b07ae556dee57f17d

spatel mentioned this in D93860: [SLP] delete unused pairwise reduction option.Dec 28 2020, 8:12 AM

spatel mentioned this in rG3b8b2c7da2ef: [SLP] delete unused pairwise reduction option.Jan 5 2021, 10:39 AM

Revision Contents

Path

Size

llvm/

lib/

Transforms/

Vectorize/

SLPVectorizer.cpp

89 lines

test/

Transforms/

SLPVectorizer/

AMDGPU/

horizontal-store.ll

52 lines

reduction.ll

80 lines

X86/

horizontal-list.ll

52 lines

horizontal-minmax.ll

360 lines

Diff 290826

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,250 Lines • ▼ Show 20 Lines	class HorizontalReduction {
SmallVector<Value *, 32> ReducedVals;		SmallVector<Value *, 32> ReducedVals;
// Use map vector to make stable output.		// Use map vector to make stable output.
MapVector<Instruction *, Value *> ExtraArgs;		MapVector<Instruction *, Value *> ExtraArgs;

/// Kind of the reduction data.		/// Kind of the reduction data.
enum ReductionKind {		enum ReductionKind {
RK_None, /// Not a reduction.		RK_None, /// Not a reduction.
RK_Arithmetic, /// Binary reduction data.		RK_Arithmetic, /// Binary reduction data.
RK_Min, /// Minimum reduction data.		RK_SMin, /// Signed minimum reduction data.
RK_UMin, /// Unsigned minimum reduction data.		RK_UMin, /// Unsigned minimum reduction data.
RK_Max, /// Maximum reduction data.		RK_SMax, /// Signed maximum reduction data.
RK_UMax, /// Unsigned maximum reduction data.		RK_UMax, /// Unsigned maximum reduction data.
};		};

/// Contains info about operation, like its opcode, left and right operands.		/// Contains info about operation, like its opcode, left and right operands.
class OperationData {		class OperationData {
/// Opcode of the instruction.		/// Opcode of the instruction.
unsigned Opcode = 0;		unsigned Opcode = 0;

Show All 13 Lines	class OperationData {
bool isVectorizable() const {		bool isVectorizable() const {
return LHS && RHS &&		return LHS && RHS &&
// We currently only support add/mul/logical && min/max reductions.		// We currently only support add/mul/logical && min/max reductions.
((Kind == RK_Arithmetic &&		((Kind == RK_Arithmetic &&
(Opcode == Instruction::Add \|\| Opcode == Instruction::FAdd \|\|		(Opcode == Instruction::Add \|\| Opcode == Instruction::FAdd \|\|
Opcode == Instruction::Mul \|\| Opcode == Instruction::FMul \|\|		Opcode == Instruction::Mul \|\| Opcode == Instruction::FMul \|\|
Opcode == Instruction::And \|\| Opcode == Instruction::Or \|\|		Opcode == Instruction::And \|\| Opcode == Instruction::Or \|\|
Opcode == Instruction::Xor)) \|\|		Opcode == Instruction::Xor)) \|\|
((Opcode == Instruction::ICmp \|\| Opcode == Instruction::FCmp) &&		(Opcode == Instruction::ICmp &&
(Kind == RK_Min \|\| Kind == RK_Max)) \|\|		(Kind == RK_SMin \|\| Kind == RK_SMax)) \|\|
(Opcode == Instruction::ICmp &&		(Opcode == Instruction::ICmp &&
(Kind == RK_UMin \|\| Kind == RK_UMax)));		(Kind == RK_UMin \|\| Kind == RK_UMax)));
}		}

/// Creates reduction operation with the current opcode.		/// Creates reduction operation with the current opcode.
Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {		Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
assert(isVectorizable() &&		assert(isVectorizable() &&
"Expected add\|fadd or min/max reduction operation.");		"Expected add\|fadd or min/max reduction operation.");
Value *Cmp = nullptr;		Value *Cmp = nullptr;
switch (Kind) {		switch (Kind) {
case RK_Arithmetic:		case RK_Arithmetic:
return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,		return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
Name);		Name);
case RK_Min:		case RK_SMin:
Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)		assert(Opcode == Instruction::ICmp && "Expected integer types.");
: Builder.CreateFCmpOLT(LHS, RHS);		Cmp = Builder.CreateICmpSLT(LHS, RHS);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);		return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_Max:		case RK_SMax:
Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)		assert(Opcode == Instruction::ICmp && "Expected integer types.");
: Builder.CreateFCmpOGT(LHS, RHS);		Cmp = Builder.CreateICmpSGT(LHS, RHS);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);		return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_UMin:		case RK_UMin:
assert(Opcode == Instruction::ICmp && "Expected integer types.");		assert(Opcode == Instruction::ICmp && "Expected integer types.");
Cmp = Builder.CreateICmpULT(LHS, RHS);		Cmp = Builder.CreateICmpULT(LHS, RHS);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);		return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_UMax:		case RK_UMax:
assert(Opcode == Instruction::ICmp && "Expected integer types.");		assert(Opcode == Instruction::ICmp && "Expected integer types.");
Cmp = Builder.CreateICmpUGT(LHS, RHS);		Cmp = Builder.CreateICmpUGT(LHS, RHS);
Show All 24 Lines	public:

explicit operator bool() const { return Opcode; }		explicit operator bool() const { return Opcode; }

/// Return true if this operation is any kind of minimum or maximum.		/// Return true if this operation is any kind of minimum or maximum.
bool isMinMax() const {		bool isMinMax() const {
switch (Kind) {		switch (Kind) {
case RK_Arithmetic:		case RK_Arithmetic:
return false;		return false;
case RK_Min:		case RK_SMin:
case RK_Max:		case RK_SMax:
case RK_UMin:		case RK_UMin:
case RK_UMax:		case RK_UMax:
return true;		return true;
case RK_None:		case RK_None:
break;		break;
}		}
llvm_unreachable("Reduction kind is not set");		llvm_unreachable("Reduction kind is not set");
}		}
▲ Show 20 Lines • Show All 65 Lines • ▼ Show 20 Lines	public:

/// Checks if instruction is associative and can be vectorized.		/// Checks if instruction is associative and can be vectorized.
bool isAssociative(Instruction *I) const {		bool isAssociative(Instruction *I) const {
assert(Kind != RK_None && *this && LHS && RHS &&		assert(Kind != RK_None && *this && LHS && RHS &&
"Expected reduction operation.");		"Expected reduction operation.");
switch (Kind) {		switch (Kind) {
case RK_Arithmetic:		case RK_Arithmetic:
return I->isAssociative();		return I->isAssociative();
case RK_Min:		case RK_SMin:
case RK_Max:		case RK_SMax:
return Opcode == Instruction::ICmp \|\|		assert(Opcode == Instruction::ICmp &&
cast<Instruction>(I->getOperand(0))->isFast();		"Only integer compare operation is expected.");
		return true;
case RK_UMin:		case RK_UMin:
case RK_UMax:		case RK_UMax:
		RKSimon (inline comment, not done): Merge these 4 cases?
assert(Opcode == Instruction::ICmp &&		assert(Opcode == Instruction::ICmp &&
"Only integer compare operation is expected.");		"Only integer compare operation is expected.");
return true;		return true;
case RK_None:		case RK_None:
break;		break;
}		}
llvm_unreachable("Reduction kind is not set");		llvm_unreachable("Reduction kind is not set");
}		}
Show All 39 Lines	Value *createOp(IRBuilder<> &Builder, const Twine &Name,
const ReductionOpsListType &ReductionOps) const {		const ReductionOpsListType &ReductionOps) const {
assert(isVectorizable() &&		assert(isVectorizable() &&
"Expected add\|fadd or min/max reduction operation.");		"Expected add\|fadd or min/max reduction operation.");
auto *Op = createOp(Builder, Name);		auto *Op = createOp(Builder, Name);
switch (Kind) {		switch (Kind) {
case RK_Arithmetic:		case RK_Arithmetic:
propagateIRFlags(Op, ReductionOps[0]);		propagateIRFlags(Op, ReductionOps[0]);
return Op;		return Op;
case RK_Min:		case RK_SMin:
case RK_Max:		case RK_SMax:
case RK_UMin:		case RK_UMin:
case RK_UMax:		case RK_UMax:
if (auto *SI = dyn_cast<SelectInst>(Op))		if (auto *SI = dyn_cast<SelectInst>(Op))
propagateIRFlags(SI->getCondition(), ReductionOps[0]);		propagateIRFlags(SI->getCondition(), ReductionOps[0]);
propagateIRFlags(Op, ReductionOps[1]);		propagateIRFlags(Op, ReductionOps[1]);
return Op;		return Op;
case RK_None:		case RK_None:
break;		break;
}		}
llvm_unreachable("Unknown reduction operation.");		llvm_unreachable("Unknown reduction operation.");
}		}
/// Creates reduction operation with the current opcode with the IR flags		/// Creates reduction operation with the current opcode with the IR flags
/// from \p I.		/// from \p I.
Value *createOp(IRBuilder<> &Builder, const Twine &Name,		Value *createOp(IRBuilder<> &Builder, const Twine &Name,
Instruction *I) const {		Instruction *I) const {
assert(isVectorizable() &&		assert(isVectorizable() &&
"Expected add\|fadd or min/max reduction operation.");		"Expected add\|fadd or min/max reduction operation.");
auto *Op = createOp(Builder, Name);		auto *Op = createOp(Builder, Name);
switch (Kind) {		switch (Kind) {
case RK_Arithmetic:		case RK_Arithmetic:
propagateIRFlags(Op, I);		propagateIRFlags(Op, I);
return Op;		return Op;
case RK_Min:		case RK_SMin:
case RK_Max:		case RK_SMax:
case RK_UMin:		case RK_UMin:
case RK_UMax:		case RK_UMax:
if (auto *SI = dyn_cast<SelectInst>(Op)) {		if (auto *SI = dyn_cast<SelectInst>(Op)) {
propagateIRFlags(SI->getCondition(),		propagateIRFlags(SI->getCondition(),
cast<SelectInst>(I)->getCondition());		cast<SelectInst>(I)->getCondition());
}		}
propagateIRFlags(Op, I);		propagateIRFlags(Op, I);
return Op;		return Op;
case RK_None:		case RK_None:
break;		break;
}		}
llvm_unreachable("Unknown reduction operation.");		llvm_unreachable("Unknown reduction operation.");
}		}

TargetTransformInfo::ReductionFlags getFlags() const {		TargetTransformInfo::ReductionFlags getFlags() const {
TargetTransformInfo::ReductionFlags Flags;		TargetTransformInfo::ReductionFlags Flags;
Flags.NoNaN = NoNaN;		Flags.NoNaN = NoNaN;
switch (Kind) {		switch (Kind) {
case RK_Arithmetic:		case RK_Arithmetic:
break;		break;
case RK_Min:		case RK_SMin:
Flags.IsSigned = Opcode == Instruction::ICmp;		Flags.IsSigned = true;
Flags.IsMaxOp = false;		Flags.IsMaxOp = false;
break;		break;
case RK_Max:		case RK_SMax:
Flags.IsSigned = Opcode == Instruction::ICmp;		Flags.IsSigned = true;
Flags.IsMaxOp = true;		Flags.IsMaxOp = true;
break;		break;
case RK_UMin:		case RK_UMin:
Flags.IsSigned = false;		Flags.IsSigned = false;
Flags.IsMaxOp = false;		Flags.IsMaxOp = false;
break;		break;
case RK_UMax:		case RK_UMax:
Flags.IsSigned = false;		Flags.IsSigned = false;
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines	if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,		return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
RK_Arithmetic);		RK_Arithmetic);
}		}
if (auto *Select = dyn_cast<SelectInst>(V)) {		if (auto *Select = dyn_cast<SelectInst>(V)) {
// Look for a min/max pattern.		// Look for a min/max pattern.
if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {		if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);		return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
} else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {		} else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);		return OperationData(Instruction::ICmp, LHS, RHS, RK_SMin);
} else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) \|\|
m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(
Instruction::FCmp, LHS, RHS, RK_Min,
cast<Instruction>(Select->getCondition())->hasNoNaNs());
} else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {		} else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);		return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
} else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {		} else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);		return OperationData(Instruction::ICmp, LHS, RHS, RK_SMax);
} else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) \|\|
m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(
Instruction::FCmp, LHS, RHS, RK_Max,
cast<Instruction>(Select->getCondition())->hasNoNaNs());
} else {		} else {
// Try harder: look for min/max pattern based on instructions producing		// Try harder: look for min/max pattern based on instructions producing
// same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).		// same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
// During the intermediate stages of SLP, it's very common to have		// During the intermediate stages of SLP, it's very common to have
// pattern like this (since optimizeGatherSequence is run only once		// pattern like this (since optimizeGatherSequence is run only once
// at the end):		// at the end):
// %1 = extractelement <2 x i32> %a, i32 0		// %1 = extractelement <2 x i32> %a, i32 0
// %2 = extractelement <2 x i32> %a, i32 1		// %2 = extractelement <2 x i32> %a, i32 1
Show All 31 Lines	if (auto *Select = dyn_cast<SelectInst>(V)) {
return OperationData(V);		return OperationData(V);

case CmpInst::ICMP_ULT:		case CmpInst::ICMP_ULT:
case CmpInst::ICMP_ULE:		case CmpInst::ICMP_ULE:
return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);		return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);

case CmpInst::ICMP_SLT:		case CmpInst::ICMP_SLT:
case CmpInst::ICMP_SLE:		case CmpInst::ICMP_SLE:
return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);		return OperationData(Instruction::ICmp, LHS, RHS, RK_SMin);

case CmpInst::FCMP_OLT:
case CmpInst::FCMP_OLE:
case CmpInst::FCMP_ULT:
case CmpInst::FCMP_ULE:
return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
cast<Instruction>(Cond)->hasNoNaNs());

case CmpInst::ICMP_UGT:		case CmpInst::ICMP_UGT:
case CmpInst::ICMP_UGE:		case CmpInst::ICMP_UGE:
return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);		return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);

case CmpInst::ICMP_SGT:		case CmpInst::ICMP_SGT:
case CmpInst::ICMP_SGE:		case CmpInst::ICMP_SGE:
return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);		return OperationData(Instruction::ICmp, LHS, RHS, RK_SMax);

case CmpInst::FCMP_OGT:
case CmpInst::FCMP_OGE:
case CmpInst::FCMP_UGT:
case CmpInst::FCMP_UGE:
return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
cast<Instruction>(Cond)->hasNoNaNs());
}		}
}		}
}		}
return OperationData(V);		return OperationData(V);
}		}

public:		public:
HorizontalReduction() = default;		HorizontalReduction() = default;
▲ Show 20 Lines • Show All 306 Lines • ▼ Show 20 Lines	int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
case RK_Arithmetic:		case RK_Arithmetic:
PairwiseRdxCost =		PairwiseRdxCost =
TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,		TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
/*IsPairwiseForm=*/true);		/*IsPairwiseForm=*/true);
SplittingRdxCost =		SplittingRdxCost =
TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,		TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
/*IsPairwiseForm=*/false);		/*IsPairwiseForm=*/false);
break;		break;
case RK_Min:		case RK_SMin:
case RK_Max:		case RK_SMax:
case RK_UMin:		case RK_UMin:
case RK_UMax: {		case RK_UMax: {
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy));		auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy));
bool IsUnsigned = ReductionData.getKind() == RK_UMin \|\|		bool IsUnsigned = ReductionData.getKind() == RK_UMin \|\|
ReductionData.getKind() == RK_UMax;		ReductionData.getKind() == RK_UMax;
PairwiseRdxCost =		PairwiseRdxCost =
TTI->getMinMaxReductionCost(VecTy, VecCondTy,		TTI->getMinMaxReductionCost(VecTy, VecCondTy,
/*IsPairwiseForm=*/true, IsUnsigned);		/*IsPairwiseForm=*/true, IsUnsigned);
Show All 10 Lines	int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;		int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

int ScalarReduxCost = 0;		int ScalarReduxCost = 0;
switch (ReductionData.getKind()) {		switch (ReductionData.getKind()) {
case RK_Arithmetic:		case RK_Arithmetic:
ScalarReduxCost =		ScalarReduxCost =
TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);		TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
break;		break;
case RK_Min:		case RK_SMin:
case RK_Max:		case RK_SMax:
case RK_UMin:		case RK_UMin:
case RK_UMax:		case RK_UMax:
ScalarReduxCost =		ScalarReduxCost =
TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +		TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,		TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
CmpInst::makeCmpResultType(ScalarTy));		CmpInst::makeCmpResultType(ScalarTy));
break;		break;
case RK_None:		case RK_None:
▲ Show 20 Lines • Show All 684 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll

Show First 20 Lines • Show All 101 Lines • ▼ Show 20 Lines	;
%cmp5 = icmp slt i64 %select4, %load6		%cmp5 = icmp slt i64 %select4, %load6
%select5 = select i1 %cmp5, i64 %select4, i64 %load6		%select5 = select i1 %cmp5, i64 %select4, i64 %load6

%store-select = select i1 %cmp1, i64 3, i64 4		%store-select = select i1 %cmp1, i64 3, i64 4
store i64 %store-select, i64* @var64, align 8		store i64 %store-select, i64* @var64, align 8
ret i64 %select5		ret i64 %select5
}		}

		; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
		; with fastmath on the select.
define float @fmaxv6() {		define float @fmaxv6() {
; GFX9-LABEL: @fmaxv6(		; GFX9-LABEL: @fmaxv6(
; GFX9-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @farr to <2 x float>*), align 16		; GFX9-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @farr to <2 x float>*), align 16
; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0		; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1		; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]		; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]]		; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]]
; GFX9-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2) to <4 x float>*), align 8		; GFX9-NEXT: [[LOAD3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>		; GFX9-NEXT: [[CMP2:%.*]] = fcmp fast ogt float [[SELECT1]], [[LOAD3]]
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP4]], [[RDX_SHUF]]		; GFX9-NEXT: [[SELECT2:%.*]] = select i1 [[CMP2]], float [[SELECT1]], float [[LOAD3]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP4]], <4 x float> [[RDX_SHUF]]		; GFX9-NEXT: [[LOAD4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 3), align 4
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>		; GFX9-NEXT: [[CMP3:%.*]] = fcmp fast ogt float [[SELECT2]], [[LOAD4]]
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]		; GFX9-NEXT: [[SELECT3:%.*]] = select i1 [[CMP3]], float [[SELECT2]], float [[LOAD4]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]		; GFX9-NEXT: [[LOAD5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16
; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0		; GFX9-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[SELECT3]], [[LOAD5]]
; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast ogt float [[TMP5]], [[SELECT1]]		; GFX9-NEXT: [[SELECT4:%.*]] = select i1 [[CMP4]], float [[SELECT3]], float [[LOAD5]]
; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], float [[TMP5]], float [[SELECT1]]		; GFX9-NEXT: [[LOAD6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4
		; GFX9-NEXT: [[CMP5:%.*]] = fcmp fast ogt float [[SELECT4]], [[LOAD6]]
		; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], float [[SELECT4]], float [[LOAD6]]
; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00		; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00
; GFX9-NEXT: store float [[STORE_SELECT]], float* @fvar, align 8		; GFX9-NEXT: store float [[STORE_SELECT]], float* @fvar, align 8
; GFX9-NEXT: ret float [[OP_EXTRA]]		; GFX9-NEXT: ret float [[SELECT5]]
;		;
%load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16		%load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16
%load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4		%load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4
%cmp1 = fcmp fast ogt float %load1, %load2		%cmp1 = fcmp fast ogt float %load1, %load2
%select1 = select i1 %cmp1, float %load1, float %load2		%select1 = select i1 %cmp1, float %load1, float %load2

%load3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8		%load3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8
%cmp2 = fcmp fast ogt float %select1, %load3		%cmp2 = fcmp fast ogt float %select1, %load3
Show All 11 Lines	;
%cmp5 = fcmp fast ogt float %select4, %load6		%cmp5 = fcmp fast ogt float %select4, %load6
%select5 = select i1 %cmp5, float %select4, float %load6		%select5 = select i1 %cmp5, float %select4, float %load6

%store-select = select i1 %cmp1, float 3.0, float 4.0		%store-select = select i1 %cmp1, float 3.0, float 4.0
store float %store-select, float* @fvar, align 8		store float %store-select, float* @fvar, align 8
ret float %select5		ret float %select5
}		}

		; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
		; with fastmath on the select.
define double @dminv6() {		define double @dminv6() {
; GFX9-LABEL: @dminv6(		; GFX9-LABEL: @dminv6(
; GFX9-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([32 x double]* @darr to <2 x double>*), align 16		; GFX9-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([32 x double]* @darr to <2 x double>*), align 16
; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0		; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1		; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]]		; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]]
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]]		; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]]
; GFX9-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2) to <4 x double>*), align 8		; GFX9-NEXT: [[LOAD3:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>		; GFX9-NEXT: [[CMP2:%.*]] = fcmp fast olt double [[SELECT1]], [[LOAD3]]
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x double> [[TMP4]], [[RDX_SHUF]]		; GFX9-NEXT: [[SELECT2:%.*]] = select i1 [[CMP2]], double [[SELECT1]], double [[LOAD3]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x double> [[TMP4]], <4 x double> [[RDX_SHUF]]		; GFX9-NEXT: [[LOAD4:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 3), align 4
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>		; GFX9-NEXT: [[CMP3:%.*]] = fcmp fast olt double [[SELECT2]], [[LOAD4]]
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x double> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]		; GFX9-NEXT: [[SELECT3:%.*]] = select i1 [[CMP3]], double [[SELECT2]], double [[LOAD4]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> [[RDX_SHUF1]]		; GFX9-NEXT: [[LOAD5:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16
; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[RDX_MINMAX_SELECT3]], i32 0		; GFX9-NEXT: [[CMP4:%.*]] = fcmp fast olt double [[SELECT3]], [[LOAD5]]
; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast olt double [[TMP5]], [[SELECT1]]		; GFX9-NEXT: [[SELECT4:%.*]] = select i1 [[CMP4]], double [[SELECT3]], double [[LOAD5]]
; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], double [[TMP5]], double [[SELECT1]]		; GFX9-NEXT: [[LOAD6:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4
		; GFX9-NEXT: [[CMP5:%.*]] = fcmp fast olt double [[SELECT4]], [[LOAD6]]
		; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], double [[SELECT4]], double [[LOAD6]]
; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00		; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00
; GFX9-NEXT: store double [[STORE_SELECT]], double* @dvar, align 8		; GFX9-NEXT: store double [[STORE_SELECT]], double* @dvar, align 8
; GFX9-NEXT: ret double [[OP_EXTRA]]		; GFX9-NEXT: ret double [[SELECT5]]
;		;
%load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16		%load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16
%load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4		%load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4
%cmp1 = fcmp fast olt double %load1, %load2		%cmp1 = fcmp fast olt double %load1, %load2
%select1 = select i1 %cmp1, double %load1, double %load2		%select1 = select i1 %cmp1, double %load1, double %load2

%load3 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8		%load3 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8
%cmp2 = fcmp fast olt double %select1, %load3		%cmp2 = fcmp fast olt double %select1, %load3
▲ Show 20 Lines • Show All 66 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll

Show First 20 Lines • Show All 605 Lines • ▼ Show 20 Lines	entry:
%cmp2 = icmp sgt i16 %elt2, %max1		%cmp2 = icmp sgt i16 %elt2, %max1
%max2 = select i1 %cmp2, i16 %elt2, i16 %max1		%max2 = select i1 %cmp2, i16 %elt2, i16 %max1
%cmp3 = icmp sgt i16 %elt3, %max2		%cmp3 = icmp sgt i16 %elt3, %max2
%max3 = select i1 %cmp3, i16 %elt3, i16 %max2		%max3 = select i1 %cmp3, i16 %elt3, i16 %max2

ret i16 %max3		ret i16 %max3
}		}

		; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
		; with fastmath on the select.
define half @reduction_fmax_v4half(<4 x half> %vec4) {		define half @reduction_fmax_v4half(<4 x half> %vec4) {
; GFX9-LABEL: @reduction_fmax_v4half(		; GCN-LABEL: @reduction_fmax_v4half(
; GFX9-NEXT: entry:		; GCN-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>		; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x half> [[VEC4]], [[RDX_SHUF]]		; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]]		; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>		; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]		; GCN-NEXT: [[CMP1:%.*]] = fcmp fast ogt half [[ELT1]], [[ELT0]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]]		; GCN-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0		; GCN-NEXT: [[CMP2:%.*]] = fcmp fast ogt half [[ELT2]], [[MAX1]]
; GFX9-NEXT: ret half [[TMP0]]		; GCN-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MAX1]]
;		; GCN-NEXT: [[CMP3:%.*]] = fcmp fast ogt half [[ELT3]], [[MAX2]]
; VI-LABEL: @reduction_fmax_v4half(		; GCN-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MAX2]]
; VI-NEXT: entry:		; GCN-NEXT: ret half [[MAX3]]
; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
; VI-NEXT: [[CMP1:%.*]] = fcmp fast ogt half [[ELT1]], [[ELT0]]
; VI-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
; VI-NEXT: [[CMP2:%.*]] = fcmp fast ogt half [[ELT2]], [[MAX1]]
; VI-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MAX1]]
; VI-NEXT: [[CMP3:%.*]] = fcmp fast ogt half [[ELT3]], [[MAX2]]
; VI-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MAX2]]
; VI-NEXT: ret half [[MAX3]]
;		;
entry:		entry:
%elt0 = extractelement <4 x half> %vec4, i64 0		%elt0 = extractelement <4 x half> %vec4, i64 0
%elt1 = extractelement <4 x half> %vec4, i64 1		%elt1 = extractelement <4 x half> %vec4, i64 1
%elt2 = extractelement <4 x half> %vec4, i64 2		%elt2 = extractelement <4 x half> %vec4, i64 2
%elt3 = extractelement <4 x half> %vec4, i64 3		%elt3 = extractelement <4 x half> %vec4, i64 3

%cmp1 = fcmp fast ogt half %elt1, %elt0		%cmp1 = fcmp fast ogt half %elt1, %elt0
%max1 = select i1 %cmp1, half %elt1, half %elt0		%max1 = select i1 %cmp1, half %elt1, half %elt0
%cmp2 = fcmp fast ogt half %elt2, %max1		%cmp2 = fcmp fast ogt half %elt2, %max1
%max2 = select i1 %cmp2, half %elt2, half %max1		%max2 = select i1 %cmp2, half %elt2, half %max1
%cmp3 = fcmp fast ogt half %elt3, %max2		%cmp3 = fcmp fast ogt half %elt3, %max2
%max3 = select i1 %cmp3, half %elt3, half %max2		%max3 = select i1 %cmp3, half %elt3, half %max2

ret half %max3		ret half %max3
}		}

		; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
		; with fastmath on the select.
define half @reduction_fmin_v4half(<4 x half> %vec4) {		define half @reduction_fmin_v4half(<4 x half> %vec4) {
; GFX9-LABEL: @reduction_fmin_v4half(		; GCN-LABEL: @reduction_fmin_v4half(
; GFX9-NEXT: entry:		; GCN-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>		; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x half> [[VEC4]], [[RDX_SHUF]]		; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]]		; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>		; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]		; GCN-NEXT: [[CMP1:%.*]] = fcmp fast olt half [[ELT1]], [[ELT0]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]]		; GCN-NEXT: [[MIN1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0		; GCN-NEXT: [[CMP2:%.*]] = fcmp fast olt half [[ELT2]], [[MIN1]]
; GFX9-NEXT: ret half [[TMP0]]		; GCN-NEXT: [[MIN2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MIN1]]
;		; GCN-NEXT: [[CMP3:%.*]] = fcmp fast olt half [[ELT3]], [[MIN2]]
; VI-LABEL: @reduction_fmin_v4half(		; GCN-NEXT: [[MIN3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MIN2]]
; VI-NEXT: entry:		; GCN-NEXT: ret half [[MIN3]]
; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
; VI-NEXT: [[CMP1:%.*]] = fcmp fast olt half [[ELT1]], [[ELT0]]
; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
; VI-NEXT: [[CMP2:%.*]] = fcmp fast olt half [[ELT2]], [[MIN1]]
; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MIN1]]
; VI-NEXT: [[CMP3:%.*]] = fcmp fast olt half [[ELT3]], [[MIN2]]
; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MIN2]]
; VI-NEXT: ret half [[MIN3]]
;		;
entry:		entry:
%elt0 = extractelement <4 x half> %vec4, i64 0		%elt0 = extractelement <4 x half> %vec4, i64 0
%elt1 = extractelement <4 x half> %vec4, i64 1		%elt1 = extractelement <4 x half> %vec4, i64 1
%elt2 = extractelement <4 x half> %vec4, i64 2		%elt2 = extractelement <4 x half> %vec4, i64 2
%elt3 = extractelement <4 x half> %vec4, i64 3		%elt3 = extractelement <4 x half> %vec4, i64 3

%cmp1 = fcmp fast olt half %elt1, %elt0		%cmp1 = fcmp fast olt half %elt1, %elt0
Show All 25 Lines	entry:
%elt2 = extractelement <4 x float> %a, i64 2		%elt2 = extractelement <4 x float> %a, i64 2
%elt3 = extractelement <4 x float> %a, i64 3		%elt3 = extractelement <4 x float> %a, i64 3

%add1 = fadd fast float %elt1, %elt0		%add1 = fadd fast float %elt1, %elt0
%add2 = fadd fast float %elt2, %add1		%add2 = fadd fast float %elt2, %add1
%add3 = fadd fast float %elt3, %add2		%add3 = fadd fast float %elt3, %add2

ret float %add3		ret float %add3
}		}
No newline at end of file

llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll

Show First 20 Lines • Show All 260 Lines • ▼ Show 20 Lines	entry:
%mul.3 = fmul fast float %10, %9		%mul.3 = fmul fast float %10, %9
%11 = fadd fast float %mul.3, %8		%11 = fadd fast float %mul.3, %8
%12 = fmul fast float %conv, %11		%12 = fmul fast float %conv, %11
%conv4 = fptosi float %12 to i32		%conv4 = fptosi float %12 to i32
store i32 %conv4, i32* @n, align 4		store i32 %conv4, i32* @n, align 4
ret i32 %conv4		ret i32 %conv4
}		}

		; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
		; with fastmath on the select.
define float @bar() {		define float @bar() {
; CHECK-LABEL: @bar(		; CHECK-LABEL: @bar(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16		; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16		; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]		; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> [[TMP2]])		; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT: store float [[TMP3]], float* @res, align 4		; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
; CHECK-NEXT: ret float [[TMP3]]		; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
		; CHECK-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]]
		; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
		; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
		; CHECK-NEXT: [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
		; CHECK-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]]
		; CHECK-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]]
		; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
		; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
		; CHECK-NEXT: [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
		; CHECK-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]]
		; CHECK-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]]
		; CHECK-NEXT: store float [[MAX_0_MUL3_2]], float* @res, align 4
		; CHECK-NEXT: ret float [[MAX_0_MUL3_2]]
;		;
; THRESHOLD-LABEL: @bar(		; THRESHOLD-LABEL: @bar(
; THRESHOLD-NEXT: entry:		; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16		; THRESHOLD-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16		; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]		; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> [[TMP2]])		; THRESHOLD-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; THRESHOLD-NEXT: store float [[TMP3]], float* @res, align 4		; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
; THRESHOLD-NEXT: ret float [[TMP3]]		; THRESHOLD-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
		; THRESHOLD-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]]
		; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
		; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
		; THRESHOLD-NEXT: [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
		; THRESHOLD-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]]
		; THRESHOLD-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]]
		; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
		; THRESHOLD-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
		; THRESHOLD-NEXT: [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
		; THRESHOLD-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]]
		; THRESHOLD-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]]
		; THRESHOLD-NEXT: store float [[MAX_0_MUL3_2]], float* @res, align 4
		; THRESHOLD-NEXT: ret float [[MAX_0_MUL3_2]]
;		;
entry:		entry:
%0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16		%0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
%1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16		%1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
%mul = fmul fast float %1, %0		%mul = fmul fast float %1, %0
%2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4		%2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
%3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4		%3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
%mul3 = fmul fast float %3, %2		%mul3 = fmul fast float %3, %2
▲ Show 20 Lines • Show All 937 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

Show First 20 Lines • Show All 192 Lines • ▼ Show 20 Lines	;
%91 = icmp sgt i32 %89, %90		%91 = icmp sgt i32 %89, %90
%92 = select i1 %91, i32 %89, i32 %90		%92 = select i1 %91, i32 %89, i32 %90
%93 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4		%93 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
%94 = icmp sgt i32 %92, %93		%94 = icmp sgt i32 %92, %93
%95 = select i1 %94, i32 %92, i32 %93		%95 = select i1 %94, i32 %92, i32 %93
ret i32 %95		ret i32 %95
}		}

		; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
		; with fastmath on the select.
define float @maxf8(float) {		define float @maxf8(float) {
; CHECK-LABEL: @maxf8(		; DEFAULT-LABEL: @maxf8(
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16		; DEFAULT-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> [[TMP2]])		; DEFAULT-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
; CHECK-NEXT: ret float [[TMP3]]		; DEFAULT-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
		; DEFAULT-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
		; DEFAULT-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
		; DEFAULT-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
		; DEFAULT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
		; DEFAULT-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
		; DEFAULT-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
		; DEFAULT-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
		; DEFAULT-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
		; DEFAULT-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
		; DEFAULT-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
		; DEFAULT-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
		; DEFAULT-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
		; DEFAULT-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
		; DEFAULT-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
		; DEFAULT-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
		; DEFAULT-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
		; DEFAULT-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
		; DEFAULT-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
		; DEFAULT-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
		; DEFAULT-NEXT: ret float [[TMP23]]
		;
		; THRESH-LABEL: @maxf8(
		; THRESH-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @arr1 to <2 x float>*), align 16
		; THRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
		; THRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
		; THRESH-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
		; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]]
		; THRESH-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
		; THRESH-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
		; THRESH-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]]
		; THRESH-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
		; THRESH-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
		; THRESH-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float [[TMP10]]
		; THRESH-NEXT: [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
		; THRESH-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
		; THRESH-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float [[TMP13]]
		; THRESH-NEXT: [[TMP16:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
		; THRESH-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
		; THRESH-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float [[TMP16]]
		; THRESH-NEXT: [[TMP19:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
		; THRESH-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
		; THRESH-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float [[TMP19]]
		; THRESH-NEXT: [[TMP22:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
		; THRESH-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
		; THRESH-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float [[TMP22]]
		; THRESH-NEXT: ret float [[TMP24]]
;		;
%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
%4 = fcmp fast ogt float %2, %3		%4 = fcmp fast ogt float %2, %3
%5 = select i1 %4, float %2, float %3		%5 = select i1 %4, float %2, float %3
%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
%7 = fcmp fast ogt float %5, %6		%7 = fcmp fast ogt float %5, %6
%8 = select i1 %7, float %5, float %6		%8 = select i1 %7, float %5, float %6
Show All 10 Lines	;
%19 = fcmp fast ogt float %17, %18		%19 = fcmp fast ogt float %17, %18
%20 = select i1 %19, float %17, float %18		%20 = select i1 %19, float %17, float %18
%21 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4		%21 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
%22 = fcmp fast ogt float %20, %21		%22 = fcmp fast ogt float %20, %21
%23 = select i1 %22, float %20, float %21		%23 = select i1 %22, float %20, float %21
ret float %23		ret float %23
}		}

		; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
		; with fastmath on the select.
define float @maxf16(float) {		define float @maxf16(float) {
; CHECK-LABEL: @maxf16(		; DEFAULT-LABEL: @maxf16(
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16		; DEFAULT-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> [[TMP2]])		; DEFAULT-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
; CHECK-NEXT: ret float [[TMP3]]		; DEFAULT-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
		; DEFAULT-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
		; DEFAULT-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
		; DEFAULT-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
		; DEFAULT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
		; DEFAULT-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
		; DEFAULT-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
		; DEFAULT-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
		; DEFAULT-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
		; DEFAULT-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
		; DEFAULT-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
		; DEFAULT-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
		; DEFAULT-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
		; DEFAULT-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
		; DEFAULT-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
		; DEFAULT-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
		; DEFAULT-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
		; DEFAULT-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
		; DEFAULT-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
		; DEFAULT-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
		; DEFAULT-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
		; DEFAULT-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
		; DEFAULT-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
		; DEFAULT-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
		; DEFAULT-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
		; DEFAULT-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
		; DEFAULT-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
		; DEFAULT-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
		; DEFAULT-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
		; DEFAULT-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
		; DEFAULT-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
		; DEFAULT-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
		; DEFAULT-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
		; DEFAULT-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
		; DEFAULT-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
		; DEFAULT-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
		; DEFAULT-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
		; DEFAULT-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
		; DEFAULT-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
		; DEFAULT-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
		; DEFAULT-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
		; DEFAULT-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
		; DEFAULT-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
		; DEFAULT-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
		; DEFAULT-NEXT: ret float [[TMP47]]
		;
		; THRESH-LABEL: @maxf16(
		; THRESH-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @arr1 to <2 x float>*), align 16
		; THRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
		; THRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
		; THRESH-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
		; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]]
		; THRESH-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
		; THRESH-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
		; THRESH-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]]
		; THRESH-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
		; THRESH-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
		; THRESH-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float [[TMP10]]
		; THRESH-NEXT: [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
		; THRESH-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
		; THRESH-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float [[TMP13]]
		; THRESH-NEXT: [[TMP16:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
		; THRESH-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
		; THRESH-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float [[TMP16]]
		; THRESH-NEXT: [[TMP19:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
		; THRESH-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
		; THRESH-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float [[TMP19]]
		; THRESH-NEXT: [[TMP22:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
		; THRESH-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
		; THRESH-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float [[TMP22]]
		; THRESH-NEXT: [[TMP25:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
		; THRESH-NEXT: [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]]
		; THRESH-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float [[TMP25]]
		; THRESH-NEXT: [[TMP28:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
		; THRESH-NEXT: [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]]
		; THRESH-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float [[TMP28]]
		; THRESH-NEXT: [[TMP31:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
		; THRESH-NEXT: [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]]
		; THRESH-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float [[TMP31]]
		; THRESH-NEXT: [[TMP34:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
		; THRESH-NEXT: [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]]
		; THRESH-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float [[TMP34]]
		; THRESH-NEXT: [[TMP37:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
		; THRESH-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]]
		; THRESH-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float [[TMP37]]
		; THRESH-NEXT: [[TMP40:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
		; THRESH-NEXT: [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]]
		; THRESH-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float [[TMP40]]
		; THRESH-NEXT: [[TMP43:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
		; THRESH-NEXT: [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]]
		; THRESH-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float [[TMP43]]
		; THRESH-NEXT: [[TMP46:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
		; THRESH-NEXT: [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]]
		; THRESH-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP45]], float [[TMP46]]
		; THRESH-NEXT: ret float [[TMP48]]
;		;
%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
%4 = fcmp fast ogt float %2, %3		%4 = fcmp fast ogt float %2, %3
%5 = select i1 %4, float %2, float %3		%5 = select i1 %4, float %2, float %3
%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
%7 = fcmp fast ogt float %5, %6		%7 = fcmp fast ogt float %5, %6
%8 = select i1 %7, float %5, float %6		%8 = select i1 %7, float %5, float %6
Show All 34 Lines	;
%43 = fcmp fast ogt float %41, %42		%43 = fcmp fast ogt float %41, %42
%44 = select i1 %43, float %41, float %42		%44 = select i1 %43, float %41, float %42
%45 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4		%45 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
%46 = fcmp fast ogt float %44, %45		%46 = fcmp fast ogt float %44, %45
%47 = select i1 %46, float %44, float %45		%47 = select i1 %46, float %44, float %45
ret float %47		ret float %47
}		}

		; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
		; with fastmath on the select.
define float @maxf32(float) {		define float @maxf32(float) {
; CHECK-LABEL: @maxf32(		; DEFAULT-LABEL: @maxf32(
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16		; DEFAULT-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> [[TMP2]])		; DEFAULT-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
; CHECK-NEXT: ret float [[TMP3]]		; DEFAULT-NEXT: [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
		; DEFAULT-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
		; DEFAULT-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
		; DEFAULT-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
		; DEFAULT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
		; DEFAULT-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
		; DEFAULT-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
		; DEFAULT-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
		; DEFAULT-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
		; DEFAULT-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
		; DEFAULT-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
		; DEFAULT-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
		; DEFAULT-NEXT: [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
		; DEFAULT-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
		; DEFAULT-NEXT: [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
		; DEFAULT-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
		; DEFAULT-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
		; DEFAULT-NEXT: [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
		; DEFAULT-NEXT: [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
		; DEFAULT-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
		; DEFAULT-NEXT: [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
		; DEFAULT-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
		; DEFAULT-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
		; DEFAULT-NEXT: [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
		; DEFAULT-NEXT: [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
		; DEFAULT-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
		; DEFAULT-NEXT: [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
		; DEFAULT-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
		; DEFAULT-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
		; DEFAULT-NEXT: [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
		; DEFAULT-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
		; DEFAULT-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
		; DEFAULT-NEXT: [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
		; DEFAULT-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
		; DEFAULT-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
		; DEFAULT-NEXT: [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
		; DEFAULT-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
		; DEFAULT-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
		; DEFAULT-NEXT: [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
		; DEFAULT-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
		; DEFAULT-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
		; DEFAULT-NEXT: [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
		; DEFAULT-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
		; DEFAULT-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
		; DEFAULT-NEXT: [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16
		; DEFAULT-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]]
		; DEFAULT-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]]
		; DEFAULT-NEXT: [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4
		; DEFAULT-NEXT: [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]]
		; DEFAULT-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]]
		; DEFAULT-NEXT: [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8
		; DEFAULT-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]]
		; DEFAULT-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]]
		; DEFAULT-NEXT: [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4
		; DEFAULT-NEXT: [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]]
		; DEFAULT-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]]
		; DEFAULT-NEXT: [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16
		; DEFAULT-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]]
		; DEFAULT-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]]
		; DEFAULT-NEXT: [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4
		; DEFAULT-NEXT: [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]]
		; DEFAULT-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]]
		; DEFAULT-NEXT: [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8
		; DEFAULT-NEXT: [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]]
		; DEFAULT-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]]
		; DEFAULT-NEXT: [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4
		; DEFAULT-NEXT: [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]]
		; DEFAULT-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]]
		; DEFAULT-NEXT: [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16
		; DEFAULT-NEXT: [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]]
		; DEFAULT-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]]
		; DEFAULT-NEXT: [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4
		; DEFAULT-NEXT: [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]]
		; DEFAULT-NEXT: [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]]
		; DEFAULT-NEXT: [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8
		; DEFAULT-NEXT: [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]]
		; DEFAULT-NEXT: [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]]
		; DEFAULT-NEXT: [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4
		; DEFAULT-NEXT: [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]]
		; DEFAULT-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]]
		; DEFAULT-NEXT: [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
		; DEFAULT-NEXT: [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]]
		; DEFAULT-NEXT: [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]]
		; DEFAULT-NEXT: [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
		; DEFAULT-NEXT: [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]]
		; DEFAULT-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]]
		; DEFAULT-NEXT: [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
		; DEFAULT-NEXT: [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]]
		; DEFAULT-NEXT: [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]]
		; DEFAULT-NEXT: [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
		; DEFAULT-NEXT: [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]]
		; DEFAULT-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]]
		; DEFAULT-NEXT: ret float [[TMP95]]
		;
		; THRESH-LABEL: @maxf32(
		; THRESH-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @arr1 to <2 x float>*), align 16
		; THRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
		; THRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
		; THRESH-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
		; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]]
		; THRESH-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
		; THRESH-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
		; THRESH-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]]
		; THRESH-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
		; THRESH-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
		; THRESH-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float [[TMP10]]
		; THRESH-NEXT: [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
		; THRESH-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
		; THRESH-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float [[TMP13]]
		; THRESH-NEXT: [[TMP16:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
		; THRESH-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
		; THRESH-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float [[TMP16]]
		; THRESH-NEXT: [[TMP19:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
		; THRESH-NEXT: [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
		; THRESH-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float [[TMP19]]
		; THRESH-NEXT: [[TMP22:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
		; THRESH-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
		; THRESH-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float [[TMP22]]
		; THRESH-NEXT: [[TMP25:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
		; THRESH-NEXT: [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]]
		; THRESH-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float [[TMP25]]
		; THRESH-NEXT: [[TMP28:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
		; THRESH-NEXT: [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]]
		; THRESH-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float [[TMP28]]
		; THRESH-NEXT: [[TMP31:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
		; THRESH-NEXT: [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]]
		; THRESH-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float [[TMP31]]
		; THRESH-NEXT: [[TMP34:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
		; THRESH-NEXT: [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]]
		; THRESH-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float [[TMP34]]
		; THRESH-NEXT: [[TMP37:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
		; THRESH-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]]
		; THRESH-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float [[TMP37]]
		; THRESH-NEXT: [[TMP40:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
		; THRESH-NEXT: [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]]
		; THRESH-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float [[TMP40]]
		; THRESH-NEXT: [[TMP43:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
		; THRESH-NEXT: [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]]
		; THRESH-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float [[TMP43]]
		; THRESH-NEXT: [[TMP46:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
		; THRESH-NEXT: [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]]
		; THRESH-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP45]], float [[TMP46]]
		; THRESH-NEXT: [[TMP49:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16
		; THRESH-NEXT: [[TMP50:%.*]] = fcmp fast ogt float [[TMP48]], [[TMP49]]
		; THRESH-NEXT: [[TMP51:%.*]] = select i1 [[TMP50]], float [[TMP48]], float [[TMP49]]
		; THRESH-NEXT: [[TMP52:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4
		; THRESH-NEXT: [[TMP53:%.*]] = fcmp fast ogt float [[TMP51]], [[TMP52]]
		; THRESH-NEXT: [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP51]], float [[TMP52]]
		; THRESH-NEXT: [[TMP55:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8
		; THRESH-NEXT: [[TMP56:%.*]] = fcmp fast ogt float [[TMP54]], [[TMP55]]
		; THRESH-NEXT: [[TMP57:%.*]] = select i1 [[TMP56]], float [[TMP54]], float [[TMP55]]
		; THRESH-NEXT: [[TMP58:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4
		; THRESH-NEXT: [[TMP59:%.*]] = fcmp fast ogt float [[TMP57]], [[TMP58]]
		; THRESH-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP57]], float [[TMP58]]
		; THRESH-NEXT: [[TMP61:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16
		; THRESH-NEXT: [[TMP62:%.*]] = fcmp fast ogt float [[TMP60]], [[TMP61]]
		; THRESH-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], float [[TMP60]], float [[TMP61]]
		; THRESH-NEXT: [[TMP64:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4
		; THRESH-NEXT: [[TMP65:%.*]] = fcmp fast ogt float [[TMP63]], [[TMP64]]
		; THRESH-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], float [[TMP63]], float [[TMP64]]
		; THRESH-NEXT: [[TMP67:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8
		; THRESH-NEXT: [[TMP68:%.*]] = fcmp fast ogt float [[TMP66]], [[TMP67]]
		; THRESH-NEXT: [[TMP69:%.*]] = select i1 [[TMP68]], float [[TMP66]], float [[TMP67]]
		; THRESH-NEXT: [[TMP70:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4
		; THRESH-NEXT: [[TMP71:%.*]] = fcmp fast ogt float [[TMP69]], [[TMP70]]
		; THRESH-NEXT: [[TMP72:%.*]] = select i1 [[TMP71]], float [[TMP69]], float [[TMP70]]
		; THRESH-NEXT: [[TMP73:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16
		; THRESH-NEXT: [[TMP74:%.*]] = fcmp fast ogt float [[TMP72]], [[TMP73]]
		; THRESH-NEXT: [[TMP75:%.*]] = select i1 [[TMP74]], float [[TMP72]], float [[TMP73]]
		; THRESH-NEXT: [[TMP76:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4
		; THRESH-NEXT: [[TMP77:%.*]] = fcmp fast ogt float [[TMP75]], [[TMP76]]
		; THRESH-NEXT: [[TMP78:%.*]] = select i1 [[TMP77]], float [[TMP75]], float [[TMP76]]
		; THRESH-NEXT: [[TMP79:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8
		; THRESH-NEXT: [[TMP80:%.*]] = fcmp fast ogt float [[TMP78]], [[TMP79]]
		; THRESH-NEXT: [[TMP81:%.*]] = select i1 [[TMP80]], float [[TMP78]], float [[TMP79]]
		; THRESH-NEXT: [[TMP82:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4
		; THRESH-NEXT: [[TMP83:%.*]] = fcmp fast ogt float [[TMP81]], [[TMP82]]
		; THRESH-NEXT: [[TMP84:%.*]] = select i1 [[TMP83]], float [[TMP81]], float [[TMP82]]
		; THRESH-NEXT: [[TMP85:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
		; THRESH-NEXT: [[TMP86:%.*]] = fcmp fast ogt float [[TMP84]], [[TMP85]]
		; THRESH-NEXT: [[TMP87:%.*]] = select i1 [[TMP86]], float [[TMP84]], float [[TMP85]]
		; THRESH-NEXT: [[TMP88:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
		; THRESH-NEXT: [[TMP89:%.*]] = fcmp fast ogt float [[TMP87]], [[TMP88]]
		; THRESH-NEXT: [[TMP90:%.*]] = select i1 [[TMP89]], float [[TMP87]], float [[TMP88]]
		; THRESH-NEXT: [[TMP91:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
		; THRESH-NEXT: [[TMP92:%.*]] = fcmp fast ogt float [[TMP90]], [[TMP91]]
		; THRESH-NEXT: [[TMP93:%.*]] = select i1 [[TMP92]], float [[TMP90]], float [[TMP91]]
		; THRESH-NEXT: [[TMP94:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
		; THRESH-NEXT: [[TMP95:%.*]] = fcmp fast ogt float [[TMP93]], [[TMP94]]
		; THRESH-NEXT: [[TMP96:%.*]] = select i1 [[TMP95]], float [[TMP93]], float [[TMP94]]
		; THRESH-NEXT: ret float [[TMP96]]
;		;
%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16		%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4		%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
%4 = fcmp fast ogt float %2, %3		%4 = fcmp fast ogt float %2, %3
%5 = select i1 %4, float %2, float %3		%5 = select i1 %4, float %2, float %3
%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8		%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
%7 = fcmp fast ogt float %5, %6		%7 = fcmp fast ogt float %5, %6
%8 = select i1 %7, float %5, float %6		%8 = select i1 %7, float %5, float %6
▲ Show 20 Lines • Show All 368 Lines • Show Last 20 Lines