diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -221,8 +221,10 @@
                            LoopVectorizationCostModel &CM)
       : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
 
-  /// Plan how to best vectorize, return the best VF and its cost.
-  VectorizationFactor plan(bool OptForSize, unsigned UserVF);
+  /// Plan how to best vectorize, return the best VF and its cost, together with
+  /// the interleave count.
+  std::pair<VectorizationFactor, unsigned>
+  plan(bool OptForSize, unsigned UserVF, unsigned UserIC);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -842,7 +842,7 @@
       : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
         AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
 
-  /// \return An upper bound for the vectorization factor, or None if
+  /// \return An upper bound for the vectorization factor, or 1 if
   /// vectorization should be avoided up front.
   Optional<unsigned> computeMaxVF(bool OptForSize);
 
@@ -6112,56 +6112,68 @@
   return NoVectorization;
 }
 
-VectorizationFactor
-LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
+std::pair<VectorizationFactor, unsigned>
+LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF,
+                               unsigned UserIC) {
   assert(OrigLoop->empty() && "Inner loop expected.");
-  // Width 1 means no vectorization, cost 0 means uncomputed cost.
-  const VectorizationFactor NoVectorization = {1U, 0U};
+
+  // Assume no vectorization until proven otherwise. Width 1 means no
+  // vectorization, cost 0 means uncomputed cost.
+  VectorizationFactor VF = {1U, 0U};
+
+  // Start with pessimistic MaxVF and extend it if MaybeMaxVF is valid.
+  unsigned MaxVF = 1;
+
   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
-  if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
-    return NoVectorization;
 
-  // Invalidate interleave groups if all blocks of loop will be predicated.
-  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
-      !useMaskedInterleavedAccesses(*TTI)) {
-    LLVM_DEBUG(
-        dbgs()
-        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
-           "which requires masked-interleaved support.\n");
-    CM.InterleaveInfo.reset();
-  }
+  if (MaybeMaxVF) {
+    // Invalidate interleave groups if all blocks of loop will be predicated.
+    if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
+        !useMaskedInterleavedAccesses(*TTI)) {
+      LLVM_DEBUG(dbgs() << "LV: Invalidate all interleaved groups due to "
+                           "fold-tail by masking which requires "
+                           "masked-interleaved support.\n");
+      CM.InterleaveInfo.reset();
+    }
 
-  if (UserVF) {
-    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
-    // Collect the instructions (and their associated costs) that will be more
-    // profitable to scalarize.
-    CM.selectUserVectorizationFactor(UserVF);
-    buildVPlansWithVPRecipes(UserVF, UserVF);
-    LLVM_DEBUG(printPlans(dbgs()));
-    return {UserVF, 0};
-  }
+    if (UserVF) {
+      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+      assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+      // Collect the instructions (and their associated costs) that will be more
+      // profitable to scalarize.
+      CM.selectUserVectorizationFactor(UserVF);
+      buildVPlansWithVPRecipes(UserVF, UserVF);
+      LLVM_DEBUG(printPlans(dbgs()));
+      return {{UserVF, 0}, CM.selectInterleaveCount(OptForSize, UserVF, 0)};
+    }
 
-  unsigned MaxVF = MaybeMaxVF.getValue();
-  assert(MaxVF != 0 && "MaxVF is zero.");
+    MaxVF = *MaybeMaxVF;
+    assert(MaxVF != 0 && "MaxVF is zero.");
 
-  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
-    // Collect Uniform and Scalar instructions after vectorization with VF.
-    CM.collectUniformsAndScalars(VF);
+    if (MaxVF > 1) {
+      CM.collectUniformsAndScalars(1);
+      for (unsigned CandidateVF = 2; CandidateVF <= MaxVF; CandidateVF *= 2) {
+        // Collect Uniform and Scalar instructions after vectorization with VF.
+        CM.collectUniformsAndScalars(CandidateVF);
 
-    // Collect the instructions (and their associated costs) that will be more
-    // profitable to scalarize.
-    if (VF > 1)
-      CM.collectInstsToScalarize(VF);
+        // Collect the instructions (and their associated costs) that will be
+        // more profitable to scalarize.
+        CM.collectInstsToScalarize(CandidateVF);
+      }
+      // Select the optimal vectorization factor.
+      VF = CM.selectVectorizationFactor(MaxVF);
+    }
   }
 
-  buildVPlansWithVPRecipes(1, MaxVF);
-  LLVM_DEBUG(printPlans(dbgs()));
-  if (MaxVF == 1)
-    return NoVectorization;
+  unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
 
-  // Select the optimal vectorization factor.
-  return CM.selectVectorizationFactor(MaxVF);
+  // Build VPlans if we either have at least one possible vectorization factor
+  // (MaxVF > 1) or decided to interleave.
+  if (MaxVF > 1 || UserIC > 1 || IC > 1) {
+    buildVPlansWithVPRecipes(1, MaxVF);
+    LLVM_DEBUG(printPlans(dbgs()));
+  }
+  return {VF, IC};
 }
 
 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
@@ -7322,15 +7334,14 @@
   // Get user vectorization factor.
   unsigned UserVF = Hints.getWidth();
 
-  // Plan how to best vectorize, return the best VF and its cost.
-  VectorizationFactor VF = LVP.plan(OptForSize, UserVF);
-
-  // Select the interleave count.
-  unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
-
   // Get user interleave count.
   unsigned UserIC = Hints.getInterleave();
 
+  // Plan how to best vectorize, return the best VF and its cost.
+  VectorizationFactor VF;
+  unsigned IC;
+  std::tie(VF, IC) = LVP.plan(OptForSize, UserVF, UserIC);
+
   // Identify the diagnostic messages that should be produced.
   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
   bool VectorizeLoop = true, InterleaveLoop = true;
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-need-vplan.ll b/llvm/test/Transforms/LoopVectorize/interleave-need-vplan.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleave-need-vplan.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @inc(i32 %n) #0 {
+; CHECK-LABEL: @inc(
+; CHECK-NEXT:    br i1 false, label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
+; CHECK:       .lr.ph.preheader:
+; CHECK-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1, 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
+; CHECK:       .lr.ph:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !3
+; CHECK:       ._crit_edge.loopexit:
+; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    ret void
+;
+  br i1 false, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %.lr.ph, %0
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  br i1 true, label %._crit_edge, label %.lr.ph, !llvm.loop !0
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.interleave.count", i32 2}