diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6570,7 +6570,7 @@
 
   /// Creates reduction operation with the current opcode.
   static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
-                         Value *RHS, const Twine &Name) {
+                         Value *RHS, const Twine &Name, bool UseSelect) {
     unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
     switch (Kind) {
     case RecurKind::Add:
@@ -6586,23 +6586,30 @@
       return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
     case RecurKind::FMin:
       return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
-
-    case RecurKind::SMax: {
-      Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
-      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
-    }
-    case RecurKind::SMin: {
-      Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
-      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
-    }
-    case RecurKind::UMax: {
-      Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
-      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
-    }
-    case RecurKind::UMin: {
-      Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
-      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
-    }
+    case RecurKind::SMax:
+      if (UseSelect) {
+        Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
+        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+      }
+      return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
+    case RecurKind::SMin:
+      if (UseSelect) {
+        Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
+        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+      }
+      return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
+    case RecurKind::UMax:
+      if (UseSelect) {
+        Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
+        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+      }
+      return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
+    case RecurKind::UMin:
+      if (UseSelect) {
+        Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
+        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+      }
+      return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
     default:
       llvm_unreachable("Unknown reduction operation.");
     }
@@ -6613,12 +6620,16 @@
   static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
                          Value *RHS, const Twine &Name,
                          const ReductionOpsListType &ReductionOps) {
-    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name);
+    bool UseSelect = ReductionOps.size() == 2;
+    assert((!UseSelect || isa<SelectInst>(ReductionOps[1][0])) &&
+           "Expected cmp + select pairs for reduction");
+    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
     if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
-      if (auto *Sel = dyn_cast<SelectInst>(Op))
+      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
         propagateIRFlags(Sel->getCondition(), ReductionOps[0]);
-      propagateIRFlags(Op, ReductionOps[1]);
-      return Op;
+        propagateIRFlags(Op, ReductionOps[1]);
+        return Op;
+      }
     }
     propagateIRFlags(Op, ReductionOps[0]);
     return Op;
@@ -6627,10 +6638,10 @@
   /// from \p I.
   static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
                          Value *RHS, const Twine &Name, Instruction *I) {
-    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name);
+    auto *SelI = dyn_cast<SelectInst>(I);
+    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr);
     if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
       if (auto *Sel = dyn_cast<SelectInst>(Op))
-        if (auto *SelI = dyn_cast<SelectInst>(I))
           propagateIRFlags(Sel->getCondition(), SelI->getCondition());
     }
     propagateIRFlags(Op, I);
@@ -6660,17 +6671,20 @@
     if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
       return RecurKind::FMin;
 
+    // This matches either cmp+select or intrinsics. SLP is expected to handle
+    // either form.
+    // TODO: If we are canonicalizing to intrinsics, we can remove several
+    //       special-case paths that deal with selects.
+    if (match(I, m_SMax(m_Value(), m_Value())))
+      return RecurKind::SMax;
+    if (match(I, m_SMin(m_Value(), m_Value())))
+      return RecurKind::SMin;
+    if (match(I, m_UMax(m_Value(), m_Value())))
+      return RecurKind::UMax;
+    if (match(I, m_UMin(m_Value(), m_Value())))
+      return RecurKind::UMin;
+
     if (auto *Select = dyn_cast<SelectInst>(I)) {
-      // These would also match llvm.{u,s}{min,max} intrinsic call
-      // if were not guarded by the SelectInst check above.
-      if (match(I, m_SMax(m_Value(), m_Value())))
-        return RecurKind::SMax;
-      if (match(I, m_SMin(m_Value(), m_Value())))
-        return RecurKind::SMin;
-      if (match(I, m_UMax(m_Value(), m_Value())))
-        return RecurKind::UMax;
-      if (match(I, m_UMin(m_Value(), m_Value())))
-        return RecurKind::UMin;
       // Try harder: look for min/max pattern based on instructions producing
       // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
      // During the intermediate stages of SLP, it's very common to have
@@ -7384,6 +7398,14 @@
     return true;
   if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
     return true;
+  if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
+    return true;
   return false;
 }
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1018,22 +1018,10 @@
 ; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 5
 ; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 6
 ; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 7
-; CHECK-NEXT:    [[T0:%.*]] = load i32, i32* [[P0]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load i32, i32* [[P1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* [[P2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load i32, i32* [[P3]], align 4
-; CHECK-NEXT:    [[T4:%.*]] = load i32, i32* [[P4]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load i32, i32* [[P5]], align 4
-; CHECK-NEXT:    [[T6:%.*]] = load i32, i32* [[P6]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load i32, i32* [[P7]], align 4
-; CHECK-NEXT:    [[M10:%.*]] = tail call i32 @llvm.smax.i32(i32 [[T1]], i32 [[T0]])
-; CHECK-NEXT:    [[M32:%.*]] = tail call i32 @llvm.smax.i32(i32 [[T3]], i32 [[T2]])
-; CHECK-NEXT:    [[M54:%.*]] = tail call i32 @llvm.smax.i32(i32 [[T5]], i32 [[T4]])
-; CHECK-NEXT:    [[M76:%.*]] = tail call i32 @llvm.smax.i32(i32 [[T7]], i32 [[T6]])
-; CHECK-NEXT:    [[M3210:%.*]] = tail call i32 @llvm.smax.i32(i32 [[M32]], i32 [[M10]])
-; CHECK-NEXT:    [[M7654:%.*]] = tail call i32 @llvm.smax.i32(i32 [[M76]], i32 [[M54]])
-; CHECK-NEXT:    [[M:%.*]] = tail call i32 @llvm.smax.i32(i32 [[M7654]], i32 [[M3210]])
-; CHECK-NEXT:    ret i32 [[M]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <8 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %p2 = getelementptr inbounds i32, i32* %p0, i64 2
@@ -1069,22 +1057,10 @@
 ; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
 ; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
 ; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
-; CHECK-NEXT:    [[T0:%.*]] = load i16, i16* [[P0]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load i16, i16* [[P1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load i16, i16* [[P2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load i16, i16* [[P3]], align 4
-; CHECK-NEXT:    [[T4:%.*]] = load i16, i16* [[P4]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load i16, i16* [[P5]], align 4
-; CHECK-NEXT:    [[T6:%.*]] = load i16, i16* [[P6]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load i16, i16* [[P7]], align 4
-; CHECK-NEXT:    [[M10:%.*]] = tail call i16 @llvm.smin.i16(i16 [[T1]], i16 [[T0]])
-; CHECK-NEXT:    [[M32:%.*]] = tail call i16 @llvm.smin.i16(i16 [[T3]], i16 [[T2]])
-; CHECK-NEXT:    [[M54:%.*]] = tail call i16 @llvm.smin.i16(i16 [[T5]], i16 [[T4]])
-; CHECK-NEXT:    [[M76:%.*]] = tail call i16 @llvm.smin.i16(i16 [[T7]], i16 [[T6]])
-; CHECK-NEXT:    [[M3210:%.*]] = tail call i16 @llvm.smin.i16(i16 [[M32]], i16 [[M10]])
-; CHECK-NEXT:    [[M7654:%.*]] = tail call i16 @llvm.smin.i16(i16 [[M76]], i16 [[M54]])
-; CHECK-NEXT:    [[M:%.*]] = tail call i16 @llvm.smin.i16(i16 [[M7654]], i16 [[M3210]])
-; CHECK-NEXT:    ret i16 [[M]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[TMP2]])
+; CHECK-NEXT:    ret i16 [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -1112,18 +1088,27 @@
 }
 
 define i64 @umax_intrinsic_rdx_v4i64(i64* %p0) {
-; CHECK-LABEL: @umax_intrinsic_rdx_v4i64(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, i64* [[P0:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load i64, i64* [[P0]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load i64, i64* [[P1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load i64, i64* [[P2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load i64, i64* [[P3]], align 4
-; CHECK-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
-; CHECK-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
-; CHECK-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
-; CHECK-NEXT:    ret i64 [[M]]
+; DEFAULT-LABEL: @umax_intrinsic_rdx_v4i64(
+; DEFAULT-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, i64* [[P0:%.*]], i64 1
+; DEFAULT-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 2
+; DEFAULT-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 3
+; DEFAULT-NEXT:    [[T0:%.*]] = load i64, i64* [[P0]], align 4
+; DEFAULT-NEXT:    [[T1:%.*]] = load i64, i64* [[P1]], align 4
+; DEFAULT-NEXT:    [[T2:%.*]] = load i64, i64* [[P2]], align 4
+; DEFAULT-NEXT:    [[T3:%.*]] = load i64, i64* [[P3]], align 4
+; DEFAULT-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
+; DEFAULT-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
+; DEFAULT-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
+; DEFAULT-NEXT:    ret i64 [[M]]
+;
+; THRESH-LABEL: @umax_intrinsic_rdx_v4i64(
+; THRESH-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, i64* [[P0:%.*]], i64 1
+; THRESH-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 2
+; THRESH-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 3
+; THRESH-NEXT:    [[TMP1:%.*]] = bitcast i64* [[P0]] to <4 x i64>*
+; THRESH-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* [[TMP1]], align 4
+; THRESH-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP2]])
+; THRESH-NEXT:    ret i64 [[TMP3]]
 ;
   %p1 = getelementptr inbounds i64, i64* %p0, i64 1
   %p2 = getelementptr inbounds i64, i64* %p0, i64 2
@@ -1155,38 +1140,10 @@
 ; CHECK-NEXT:    [[PD:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
 ; CHECK-NEXT:    [[PE:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
 ; CHECK-NEXT:    [[PF:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
-; CHECK-NEXT:    [[T0:%.*]] = load i8, i8* [[P0]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load i8, i8* [[P1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load i8, i8* [[P2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load i8, i8* [[P3]], align 4
-; CHECK-NEXT:    [[T4:%.*]] = load i8, i8* [[P4]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load i8, i8* [[P5]], align 4
-; CHECK-NEXT:    [[T6:%.*]] = load i8, i8* [[P6]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load i8, i8* [[P7]], align 4
-; CHECK-NEXT:    [[T8:%.*]] = load i8, i8* [[P8]], align 4
-; CHECK-NEXT:    [[T9:%.*]] = load i8, i8* [[P9]], align 4
-; CHECK-NEXT:    [[TA:%.*]] = load i8, i8* [[PA]], align 4
-; CHECK-NEXT:    [[TB:%.*]] = load i8, i8* [[PB]], align 4
-; CHECK-NEXT:    [[TC:%.*]] = load i8, i8* [[PC]], align 4
-; CHECK-NEXT:    [[TD:%.*]] = load i8, i8* [[PD]], align 4
-; CHECK-NEXT:    [[TE:%.*]] = load i8, i8* [[PE]], align 4
-; CHECK-NEXT:    [[TF:%.*]] = load i8, i8* [[PF]], align 4
-; CHECK-NEXT:    [[M10:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T1]], i8 [[T0]])
-; CHECK-NEXT:    [[M32:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T3]], i8 [[T2]])
-; CHECK-NEXT:    [[M54:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T5]], i8 [[T4]])
-; CHECK-NEXT:    [[M76:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T7]], i8 [[T6]])
-; CHECK-NEXT:    [[M98:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T9]], i8 [[T8]])
-; CHECK-NEXT:    [[MBA:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TB]], i8 [[TA]])
-; CHECK-NEXT:    [[MDC:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TD]], i8 [[TC]])
-; CHECK-NEXT:    [[MFE:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TF]], i8 [[TE]])
-; CHECK-NEXT:    [[M3210:%.*]] = tail call i8 @llvm.umin.i8(i8 [[M32]], i8 [[M10]])
-; CHECK-NEXT:    [[M7654:%.*]] = tail call i8 @llvm.umin.i8(i8 [[M76]], i8 [[M54]])
-; CHECK-NEXT:    [[MDC98:%.*]] = tail call i8 @llvm.umin.i8(i8 [[MDC]], i8 [[M98]])
-; CHECK-NEXT:    [[MFEBA:%.*]] = tail call i8 @llvm.umin.i8(i8 [[MFE]], i8 [[MBA]])
-; CHECK-NEXT:    [[ML:%.*]] = tail call i8 @llvm.umin.i8(i8 [[M3210]], i8 [[M7654]])
-; CHECK-NEXT:    [[MH:%.*]] = tail call i8 @llvm.umin.i8(i8 [[MFEBA]], i8 [[MDC98]])
-; CHECK-NEXT:    [[M:%.*]] = tail call i8 @llvm.umin.i8(i8 [[MH]], i8 [[ML]])
-; CHECK-NEXT:    ret i8 [[M]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[TMP2]])
+; CHECK-NEXT:    ret i8 [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -1241,21 +1198,13 @@
 
 define void @PR49730() {
 ; CHECK-LABEL: @PR49730(
-; CHECK-NEXT:    [[T:%.*]] = call i32 @llvm.smin.i32(i32 undef, i32 2)
-; CHECK-NEXT:    [[T1:%.*]] = sub nsw i32 undef, [[T]]
-; CHECK-NEXT:    [[T2:%.*]] = call i32 @llvm.umin.i32(i32 undef, i32 [[T1]])
-; CHECK-NEXT:    [[T3:%.*]] = call i32 @llvm.smin.i32(i32 undef, i32 2)
-; CHECK-NEXT:    [[T4:%.*]] = sub nsw i32 undef, [[T3]]
-; CHECK-NEXT:    [[T5:%.*]] = call i32 @llvm.umin.i32(i32 [[T2]], i32 [[T4]])
-; CHECK-NEXT:    [[T6:%.*]] = call i32 @llvm.smin.i32(i32 undef, i32 1)
-; CHECK-NEXT:    [[T7:%.*]] = sub nuw nsw i32 undef, [[T6]]
-; CHECK-NEXT:    [[T8:%.*]] = call i32 @llvm.umin.i32(i32 [[T5]], i32 [[T7]])
-; CHECK-NEXT:    [[T9:%.*]] = call i32 @llvm.smin.i32(i32 undef, i32 1)
-; CHECK-NEXT:    [[T10:%.*]] = sub nsw i32 undef, [[T9]]
-; CHECK-NEXT:    [[T11:%.*]] = call i32 @llvm.umin.i32(i32 [[T8]], i32 [[T10]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]]
 ; CHECK-NEXT:    [[T12:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT:    [[T13:%.*]] = call i32 @llvm.umin.i32(i32 [[T11]], i32 [[T12]])
-; CHECK-NEXT:    [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[T13]], i32 93)
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef)
+; CHECK-NEXT:    [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
 ; CHECK-NEXT:    ret void
 ;
   %t = call i32 @llvm.smin.i32(i32 undef, i32 2)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 
 @arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
 
@@ -21,15 +21,20 @@
 }
 
 define i32 @smax_v4i32(i32) {
-; CHECK-LABEL: @smax_v4i32(
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[TMP5]])
-; CHECK-NEXT:    ret i32 [[TMP8]]
+; SSE-LABEL: @smax_v4i32(
+; SSE-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
+; SSE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[TMP4]])
+; SSE-NEXT:    [[TMP8:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[TMP5]])
+; SSE-NEXT:    ret i32 [[TMP8]]
+;
+; AVX-LABEL: @smax_v4i32(
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr to <4 x i32>*), align 16
+; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; AVX-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -43,22 +48,9 @@
 
 define i32 @smax_v8i32(i32) {
 ; CHECK-LABEL: @smax_v8i32(
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP10]], i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP11]], i32 [[TMP5]])
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 [[TMP6]])
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP13]], i32 [[TMP7]])
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP14]], i32 [[TMP8]])
-; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP15]], i32 [[TMP9]])
-; CHECK-NEXT:    ret i32 [[TMP16]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -80,38 +72,9 @@
 
 define i32 @smax_v16i32(i32) {
-; CHECK-LABEL: @smax_v16i32(
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
-; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP18]], i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP20:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP19]], i32 [[TMP5]])
-; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP20]], i32 [[TMP6]])
-; CHECK-NEXT:    [[TMP22:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP21]], i32 [[TMP7]])
-; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP22]], i32 [[TMP8]])
-; CHECK-NEXT:    [[TMP24:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP23]], i32 [[TMP9]])
-; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP24]], i32 [[TMP10]])
-; CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP25]], i32 [[TMP11]])
-; CHECK-NEXT:    [[TMP27:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP26]], i32 [[TMP12]])
-; CHECK-NEXT:    [[TMP28:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP27]], i32 [[TMP13]])
-; CHECK-NEXT:    [[TMP29:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP28]], i32 [[TMP14]])
-; CHECK-NEXT:    [[TMP30:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP29]], i32 [[TMP15]])
-; CHECK-NEXT:    [[TMP31:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP30]], i32 [[TMP16]])
-; CHECK-NEXT:    [[TMP32:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP31]], i32 [[TMP17]])
-; CHECK-NEXT:    ret i32 [[TMP32]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
--- a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
@@ -43,19 +43,11 @@
 define void @test2() {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SMIN0:%.*]] = call i32 @llvm.smin.i32(i32 undef, i32 0)
-; CHECK-NEXT:    [[SMIN1:%.*]] = call i32 @llvm.smin.i32(i32 undef, i32 1)
-; CHECK-NEXT:    [[SMIN2:%.*]] = call i32 @llvm.smin.i32(i32 undef, i32 2)
-; CHECK-NEXT:    [[SMIN3:%.*]] = call i32 @llvm.smin.i32(i32 undef, i32 3)
-; CHECK-NEXT:    [[A:%.*]] = sub nsw i32 undef, [[SMIN0]]
-; CHECK-NEXT:    [[B:%.*]] = sub nsw i32 undef, [[SMIN1]]
-; CHECK-NEXT:    [[C:%.*]] = sub nsw i32 undef, [[SMIN2]]
-; CHECK-NEXT:    [[D:%.*]] = sub nsw i32 undef, [[SMIN3]]
-; CHECK-NEXT:    [[UMIN0:%.*]] = call i32 @llvm.umin.i32(i32 [[D]], i32 [[C]])
-; CHECK-NEXT:    [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN0]], i32 [[B]])
-; CHECK-NEXT:    [[UMIN2:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN1]], i32 [[A]])
-; CHECK-NEXT:    [[UMIN3:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN2]], i32 77)
-; CHECK-NEXT:    [[E:%.*]] = icmp ugt i32 [[UMIN3]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <4 x i32> undef, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 77)
+; CHECK-NEXT:    [[E:%.*]] = icmp ugt i32 [[TMP3]], 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
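
For reference, a minimal standalone reproducer (not part of this patch; function and value names are illustrative only) showing the intrinsic form of an smax reduction that the matching code above now recognizes alongside the equivalent icmp sgt + select chain. Whether opt -slp-vectorizer actually turns it into an @llvm.vector.reduce.smax call still depends on the target cost model, as the SSE/AVX split in the updated horizontal-smax.ll checks shows.

define i32 @smax_rdx_example(i32* %p) {
  %g1 = getelementptr inbounds i32, i32* %p, i64 1
  %g2 = getelementptr inbounds i32, i32* %p, i64 2
  %g3 = getelementptr inbounds i32, i32* %p, i64 3
  %t0 = load i32, i32* %p, align 4
  %t1 = load i32, i32* %g1, align 4
  %t2 = load i32, i32* %g2, align 4
  %t3 = load i32, i32* %g3, align 4
  ; reduction written with the smax intrinsic instead of icmp sgt + select
  %m01 = call i32 @llvm.smax.i32(i32 %t0, i32 %t1)
  %m012 = call i32 @llvm.smax.i32(i32 %m01, i32 %t2)
  %m = call i32 @llvm.smax.i32(i32 %m012, i32 %t3)
  ret i32 %m
}
declare i32 @llvm.smax.i32(i32, i32)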