diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -353,6 +353,9 @@
                   SinkAndHoistLICMFlags &LICMFlags,
                   OptimizationRemarkEmitter *ORE = nullptr);
 
+/// Returns the min/max intrinsic used when expanding a min/max reduction.
+Intrinsic::ID getMinMaxReductionIntrinsicOp(RecurKind RK);
+
 /// Returns the comparison predicate used when expanding a min/max reduction.
 CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK);
 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -893,6 +893,25 @@
   return true;
 }
 
+Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(RecurKind RK) {
+  switch (RK) {
+  default:
+    llvm_unreachable("Unknown min/max recurrence kind");
+  case RecurKind::UMin:
+    return Intrinsic::umin;
+  case RecurKind::UMax:
+    return Intrinsic::umax;
+  case RecurKind::SMin:
+    return Intrinsic::smin;
+  case RecurKind::SMax:
+    return Intrinsic::smax;
+  case RecurKind::FMin:
+    return Intrinsic::minnum;
+  case RecurKind::FMax:
+    return Intrinsic::maxnum;
+  }
+}
+
 CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) {
   switch (RK) {
   default:
@@ -923,6 +942,7 @@
 
 Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
                             Value *Right) {
+  // TODO: Should we create a min/max intrinsic instead of cmp+sel?
   CmpInst::Predicate Pred = getMinMaxReductionPredicate(RK);
   Value *Cmp = Builder.CreateCmp(Pred, Left, Right, "rdx.minmax.cmp");
   Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13622,7 +13622,6 @@
     }
     case RecurKind::FMax:
     case RecurKind::FMin: {
-      auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
       if (!AllConsts) {
         auto *VecCondTy =
             cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
@@ -13630,12 +13629,10 @@
         VectorCost =
            TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
                                        /*IsUnsigned=*/false, FMF, CostKind);
       }
-      CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
+      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
       ScalarCost = EvaluateScalarCost([&]() {
-        return TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy,
-                                       RdxPred, CostKind) +
-               TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
-                                       RdxPred, CostKind);
+        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
+        return TTI->getIntrinsicInstrCost(ICA, CostKind);
       });
       break;
     }
@@ -13643,7 +13640,6 @@
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
-      auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
       if (!AllConsts) {
         auto *VecCondTy =
             cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
@@ -13652,12 +13648,10 @@
         VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
                                                  IsUnsigned, FMF, CostKind);
       }
-      CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
+      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
       ScalarCost = EvaluateScalarCost([&]() {
-        return TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, SclCondTy,
-                                       RdxPred, CostKind) +
-               TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
-                                       RdxPred, CostKind);
+        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
+        return TTI->getIntrinsicInstrCost(ICA, CostKind);
       });
       break;
     }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE4
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -passes=slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefixes=CHECK,THRESH
 
 @arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
@@ -765,24 +765,70 @@
 }
 
 define i32 @maxi8_mutiple_uses(i32) {
-; DEFAULT-LABEL: @maxi8_mutiple_uses(
-; DEFAULT-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
-; DEFAULT-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
-; DEFAULT-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; DEFAULT-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; DEFAULT-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; DEFAULT-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
-; DEFAULT-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; DEFAULT-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; DEFAULT-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
-; DEFAULT-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
-; DEFAULT-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
-; DEFAULT-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
-; DEFAULT-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
-; DEFAULT-NEXT:    [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4
-; DEFAULT-NEXT:    store i32 [[TMP10]], ptr @var, align 8
-; DEFAULT-NEXT:    ret i32 [[OP_RDX5]]
+; SSE2-LABEL: @maxi8_mutiple_uses(
+; SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; SSE2-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; SSE2-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE2-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; SSE2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; SSE2-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
+; SSE2-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; SSE2-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
+; SSE2-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; SSE2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; SSE2-NEXT:    [[TMP15:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
+; SSE2-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; SSE2-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; SSE2-NEXT:    [[TMP18:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; SSE2-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; SSE2-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; SSE2-NEXT:    [[TMP21:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
+; SSE2-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; SSE2-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; SSE2-NEXT:    [[TMP24:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; SSE2-NEXT:    store i32 [[TMP24]], ptr @var, align 8
+; SSE2-NEXT:    ret i32 [[TMP23]]
+;
+; SSE4-LABEL: @maxi8_mutiple_uses(
+; SSE4-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; SSE4-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; SSE4-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; SSE4-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SSE4-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE4-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; SSE4-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
+; SSE4-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
+; SSE4-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
+; SSE4-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
+; SSE4-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
+; SSE4-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
+; SSE4-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
+; SSE4-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
+; SSE4-NEXT:    [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; SSE4-NEXT:    store i32 [[TMP10]], ptr @var, align 8
+; SSE4-NEXT:    ret i32 [[OP_RDX5]]
+;
+; AVX-LABEL: @maxi8_mutiple_uses(
+; AVX-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; AVX-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
+; AVX-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
+; AVX-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
+; AVX-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
+; AVX-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
+; AVX-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
+; AVX-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
+; AVX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; AVX-NEXT:    store i32 [[TMP10]], ptr @var, align 8
+; AVX-NEXT:    ret i32 [[OP_RDX5]]
 ;
 ; THRESH-LABEL: @maxi8_mutiple_uses(
 ; THRESH-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16
@@ -901,24 +947,70 @@
 }
 
 define i32 @maxi8_wrong_parent(i32) {
-; DEFAULT-LABEL: @maxi8_wrong_parent(
-; DEFAULT-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
-; DEFAULT-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
-; DEFAULT-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; DEFAULT-NEXT:    br label [[PP:%.*]]
-; DEFAULT:       pp:
-; DEFAULT-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; DEFAULT-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; DEFAULT-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
-; DEFAULT-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; DEFAULT-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; DEFAULT-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
-; DEFAULT-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
-; DEFAULT-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
-; DEFAULT-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
-; DEFAULT-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
-; DEFAULT-NEXT:    ret i32 [[OP_RDX5]]
+; SSE2-LABEL: @maxi8_wrong_parent(
+; SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; SSE2-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; SSE2-NEXT:    br label [[PP:%.*]]
+; SSE2:       pp:
+; SSE2-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE2-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; SSE2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; SSE2-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
+; SSE2-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; SSE2-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
+; SSE2-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; SSE2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; SSE2-NEXT:    [[TMP15:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
+; SSE2-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; SSE2-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; SSE2-NEXT:    [[TMP18:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; SSE2-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; SSE2-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; SSE2-NEXT:    [[TMP21:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
+; SSE2-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; SSE2-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; SSE2-NEXT:    ret i32 [[TMP23]]
+;
+; SSE4-LABEL: @maxi8_wrong_parent(
+; SSE4-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; SSE4-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; SSE4-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; SSE4-NEXT:    br label [[PP:%.*]]
+; SSE4:       pp:
+; SSE4-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SSE4-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE4-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; SSE4-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
+; SSE4-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
+; SSE4-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
+; SSE4-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
+; SSE4-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
+; SSE4-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
+; SSE4-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
+; SSE4-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
+; SSE4-NEXT:    ret i32 [[OP_RDX5]]
+;
+; AVX-LABEL: @maxi8_wrong_parent(
+; AVX-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; AVX-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX-NEXT:    br label [[PP:%.*]]
+; AVX:       pp:
+; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
+; AVX-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
+; AVX-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
+; AVX-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
+; AVX-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
+; AVX-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
+; AVX-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
+; AVX-NEXT:    ret i32 [[OP_RDX5]]
 ;
 ; THRESH-LABEL: @maxi8_wrong_parent(
 ; THRESH-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
@@ -11,23 +11,41 @@
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[T:%.*]] = select i1 undef, i16 undef, i16 15
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i16> , i16 [[T]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i16> [[TMP0]] to <2 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> , [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], undef
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE2]],
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT:    [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef
+; CHECK-NEXT:    [[T3:%.*]] = sext i16 undef to i32
+; CHECK-NEXT:    [[T4:%.*]] = sext i16 [[T]] to i32
+; CHECK-NEXT:    [[T5:%.*]] = sub nsw i32 undef, [[T4]]
+; CHECK-NEXT:    [[T6:%.*]] = sub i32 [[T5]], undef
+; CHECK-NEXT:    [[T7:%.*]] = sub nsw i32 63, [[T3]]
+; CHECK-NEXT:    [[T8:%.*]] = sub i32 [[T7]], undef
+; CHECK-NEXT:    [[T9:%.*]] = add i32 [[T8]], undef
+; CHECK-NEXT:    [[T10:%.*]] = add nsw i32 [[T6]], 15
+; CHECK-NEXT:    [[T11:%.*]] = icmp sgt i32 [[T9]], [[T10]]
+; CHECK-NEXT:    [[T12:%.*]] = select i1 [[T11]], i32 [[T9]], i32 [[T10]]
+; CHECK-NEXT:    [[T13:%.*]] = add nsw i32 [[T6]], 31
+; CHECK-NEXT:    [[T14:%.*]] = icmp sgt i32 [[T12]], [[T13]]
+; CHECK-NEXT:    [[T15:%.*]] = select i1 [[T14]], i32 [[T12]], i32 [[T13]]
+; CHECK-NEXT:    [[T16:%.*]] = add nsw i32 [[T6]], 47
+; CHECK-NEXT:    [[T17:%.*]] = icmp sgt i32 [[T15]], [[T16]]
+; CHECK-NEXT:    [[T18:%.*]] = select i1 [[T17]], i32 [[T15]], i32 [[T16]]
+; CHECK-NEXT:    [[T19:%.*]] = select i1 undef, i32 [[T18]], i32 undef
 ; CHECK-NEXT:    [[T20:%.*]] = icmp sgt i32 [[T19]], 63
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i32> undef, [[TMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP6]], undef
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]],
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = icmp slt i32 undef, [[TMP9]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 undef, i32 [[TMP9]]
-; CHECK-NEXT:    [[T45:%.*]] = icmp sgt i32 undef, [[OP_RDX1]]
+; CHECK-NEXT:    [[T21:%.*]] = sub nsw i32 undef, [[T3]]
+; CHECK-NEXT:    [[T22:%.*]] = sub i32 [[T21]], undef
+; CHECK-NEXT:    [[T23:%.*]] = sub nsw i32 undef, [[T4]]
+; CHECK-NEXT:    [[T24:%.*]] = sub i32 [[T23]], undef
+; CHECK-NEXT:    [[T25:%.*]] = add nsw i32 [[T24]], -49
+; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T22]], -33
+; CHECK-NEXT:    [[T35:%.*]] = add nsw i32 [[T24]], -33
+; CHECK-NEXT:    [[T40:%.*]] = add nsw i32 [[T22]], -17
+; CHECK-NEXT:    [[OP_RDX:%.*]] = icmp slt i32 undef, [[T40]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 undef, i32 [[T40]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = icmp slt i32 [[T35]], [[T30]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[T35]], i32 [[T30]]
+; CHECK-NEXT:    [[OP_RDX4:%.*]] = icmp slt i32 [[OP_RDX1]], [[OP_RDX3]]
+; CHECK-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
+; CHECK-NEXT:    [[OP_RDX6:%.*]] = icmp slt i32 [[OP_RDX5]], [[T25]]
+; CHECK-NEXT:    [[OP_RDX7:%.*]] = select i1 [[OP_RDX6]], i32 [[OP_RDX5]], i32 [[T25]]
+; CHECK-NEXT:    [[T45:%.*]] = icmp sgt i32 undef, [[OP_RDX7]]
 ; CHECK-NEXT:    unreachable
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -4,19 +4,32 @@
 define i32 @foo(ptr nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[ARR:%.*]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
-; CHECK-NEXT:    ret i32 [[TMP11]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[A1:%.*]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[TMP0]], [[A2:%.*]]
+; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[TMP0]], [[A3:%.*]]
+; CHECK-NEXT:    [[ADD6:%.*]] = add i32 [[TMP0]], [[A4:%.*]]
+; CHECK-NEXT:    [[ADD8:%.*]] = add i32 [[TMP0]], [[A5:%.*]]
+; CHECK-NEXT:    [[ADD10:%.*]] = add i32 [[TMP0]], [[A6:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARR]], align 4
+; CHECK-NEXT:    [[ADD12:%.*]] = add i32 [[TMP1]], [[A7:%.*]]
+; CHECK-NEXT:    [[ADD14:%.*]] = add i32 [[TMP1]], [[A8:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD]], [[ADD2]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 [[ADD2]]
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], [[ADD4]]
+; CHECK-NEXT:    [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 [[ADD4]]
+; CHECK-NEXT:    [[CMP20:%.*]] = icmp ult i32 [[COND19]], [[ADD6]]
+; CHECK-NEXT:    [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 [[ADD6]]
+; CHECK-NEXT:    [[CMP25:%.*]] = icmp ult i32 [[COND24]], [[ADD8]]
+; CHECK-NEXT:    [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 [[ADD8]]
+; CHECK-NEXT:    [[CMP30:%.*]] = icmp ult i32 [[COND29]], [[ADD10]]
+; CHECK-NEXT:    [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 [[ADD10]]
+; CHECK-NEXT:    [[CMP35:%.*]] = icmp ult i32 [[COND34]], [[ADD12]]
+; CHECK-NEXT:    [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 [[ADD12]]
+; CHECK-NEXT:    [[CMP40:%.*]] = icmp ult i32 [[COND39]], [[ADD14]]
+; CHECK-NEXT:    [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 [[ADD14]]
+; CHECK-NEXT:    ret i32 [[COND44]]
 ;
 entry:
   %arrayidx = getelementptr inbounds i32, ptr %arr, i64 1
@@ -50,19 +63,36 @@
 define i32 @foo1(ptr nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
 ; CHECK-LABEL: @foo1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARR:%.*]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
-; CHECK-NEXT:    ret i32 [[TMP11]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[A1:%.*]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[TMP1]], [[A2:%.*]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[TMP2]], [[A3:%.*]]
+; CHECK-NEXT:    [[ADD6:%.*]] = add i32 [[TMP0]], [[A4:%.*]]
+; CHECK-NEXT:    [[ADD8:%.*]] = add i32 [[TMP0]], [[A5:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARR]], align 4
+; CHECK-NEXT:    [[ADD10:%.*]] = add i32 [[TMP3]], [[A6:%.*]]
+; CHECK-NEXT:    [[ADD12:%.*]] = add i32 [[TMP1]], [[A7:%.*]]
+; CHECK-NEXT:    [[ADD14:%.*]] = add i32 [[TMP0]], [[A8:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD]], [[ADD2]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 [[ADD2]]
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], [[ADD4]]
+; CHECK-NEXT:    [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 [[ADD4]]
+; CHECK-NEXT:    [[CMP20:%.*]] = icmp ult i32 [[COND19]], [[ADD6]]
+; CHECK-NEXT:    [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 [[ADD6]]
+; CHECK-NEXT:    [[CMP25:%.*]] = icmp ult i32 [[COND24]], [[ADD8]]
+; CHECK-NEXT:    [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 [[ADD8]]
+; CHECK-NEXT:    [[CMP30:%.*]] = icmp ult i32 [[COND29]], [[ADD10]]
+; CHECK-NEXT:    [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 [[ADD10]]
+; CHECK-NEXT:    [[CMP35:%.*]] = icmp ult i32 [[COND34]], [[ADD12]]
+; CHECK-NEXT:    [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 [[ADD12]]
+; CHECK-NEXT:    [[CMP40:%.*]] = icmp ult i32 [[COND39]], [[ADD14]]
+; CHECK-NEXT:    [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 [[ADD14]]
+; CHECK-NEXT:    ret i32 [[COND44]]
 ;
 entry:
   %arrayidx = getelementptr inbounds i32, ptr %arr, i64 1
@@ -100,19 +130,36 @@
 define i32 @foo2(ptr nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
 ; CHECK-LABEL: @foo2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARR:%.*]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
-; CHECK-NEXT:    ret i32 [[TMP11]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[A1:%.*]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[TMP1]], [[A2:%.*]]
+; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[TMP0]], [[A3:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARR]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = add i32 [[TMP2]], [[A4:%.*]]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[ADD8:%.*]] = add i32 [[TMP3]], [[A5:%.*]]
+; CHECK-NEXT:    [[ADD10:%.*]] = add i32 [[TMP2]], [[A6:%.*]]
+; CHECK-NEXT:    [[ADD12:%.*]] = add i32 [[TMP1]], [[A7:%.*]]
+; CHECK-NEXT:    [[ADD14:%.*]] = add i32 [[TMP3]], [[A8:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD]], [[ADD2]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 [[ADD2]]
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], [[ADD4]]
+; CHECK-NEXT:    [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 [[ADD4]]
+; CHECK-NEXT:    [[CMP20:%.*]] = icmp ult i32 [[COND19]], [[ADD6]]
+; CHECK-NEXT:    [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 [[ADD6]]
+; CHECK-NEXT:    [[CMP25:%.*]] = icmp ult i32 [[COND24]], [[ADD8]]
+; CHECK-NEXT:    [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 [[ADD8]]
+; CHECK-NEXT:    [[CMP30:%.*]] = icmp ult i32 [[COND29]], [[ADD10]]
+; CHECK-NEXT:    [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 [[ADD10]]
+; CHECK-NEXT:    [[CMP35:%.*]] = icmp ult i32 [[COND34]], [[ADD12]]
+; CHECK-NEXT:    [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 [[ADD12]]
+; CHECK-NEXT:    [[CMP40:%.*]] = icmp ult i32 [[COND39]], [[ADD14]]
+; CHECK-NEXT:    [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 [[ADD14]]
+; CHECK-NEXT:    ret i32 [[COND44]]
 ;
 entry:
   %arrayidx = getelementptr inbounds i32, ptr %arr, i64 3
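Reviewer note, not part of the patch: the functional change is confined to SLP's scalar cost model for min/max reductions. Each scalar reduction step used to be priced as a compare plus a select; it is now priced as a single call to the min/max intrinsic chosen by the new getMinMaxReductionIntrinsicOp() helper. A minimal sketch of the new-style query follows, assuming an in-scope TargetTransformInfo; the helper name scalarMinMaxStepCost is hypothetical, but the LLVM APIs it calls are the ones used in the patch:

    // Sketch: price one scalar step of a signed-max reduction the way the
    // patched code does, i.e. as one llvm.smax call instead of icmp+select.
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Intrinsics.h"

    using namespace llvm;

    static InstructionCost
    scalarMinMaxStepCost(const TargetTransformInfo &TTI, Type *ScalarTy,
                         FastMathFlags FMF,
                         TargetTransformInfo::TargetCostKind CostKind) {
      // getMinMaxReductionIntrinsicOp(RecurKind::SMax) returns Intrinsic::smax.
      Intrinsic::ID Id = Intrinsic::smax;
      // Cost of one `i32 @llvm.smax.i32(i32, i32)` call; typically cheaper than
      // the old sum of a compare cost and a select cost.
      IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
      return TTI.getIntrinsicInstrCost(ICA, CostKind);
    }

The cheaper scalar side shifts the profitability comparison, which appears consistent with the test updates above: the baseline SSE2 runs stop forming llvm.vector.reduce.smax/umin calls and keep the scalar cmp+select chains, while the SSE4 and AVX runs still vectorize.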