diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -203,9 +203,6 @@
   /// Loop Info analysis.
   LoopInfo *LI;
 
-  /// Target Library Info.
-  const TargetLibraryInfo *TLI;
-
   /// Target Transform Info.
   const TargetTransformInfo *TTI;
 
@@ -232,13 +229,13 @@
   unsigned BestUF = 0;
 
 public:
-  LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+  LoopVectorizationPlanner(Loop *L, LoopInfo *LI,
                            const TargetTransformInfo *TTI,
                            LoopVectorizationLegality *Legal,
                            LoopVectorizationCostModel &CM,
                            InterleavedAccessInfo &IAI,
                            PredicatedScalarEvolution &PSE)
-      : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
+      : OrigLoop(L), LI(LI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
         PSE(PSE) {}
 
   /// Plan how to best vectorize, return the best VF and its cost, or None if
@@ -301,6 +298,12 @@
       VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
       const DenseMap<Instruction *, Instruction *> &SinkAfter);
 
+  /// Compute a list of vector factor ranges so that the decisions for each
+  /// instruction match across all vector factors in the range.
+  SmallVector<VFRange>
+  computeRanges(ElementCount MinVF, ElementCount MaxVF,
+                SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1593,6 +1593,49 @@
     Scalars.clear();
   }
 
+  /// Returns true if \p CI should be widened for \p VF.
+  bool shouldWidenCall(CallInst *CI, ElementCount VF) {
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
+               ID == Intrinsic::pseudoprobe ||
+               ID == Intrinsic::experimental_noalias_scope_decl))
+      return false;
+
+    // The following case may be scalarized depending on the VF.
+    // The flag shows whether we use Intrinsic or a usual Call for vectorized
+    // version of the instruction.
+    // Is it beneficial to perform intrinsic call compared to lib call?
+    bool NeedToScalarize = false;
+    InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
+    InstructionCost IntrinsicCost = ID ? getVectorIntrinsicCost(CI, VF) : 0;
+    bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
+    assert(IntrinsicCost.isValid() && CallCost.isValid() &&
+           "Cannot have invalid costs while widening");
+    return UseVectorIntrinsic || !NeedToScalarize;
+  }
+
+  /// Returns true if memory instruction \p I should be widened for \p VF.
+  bool shouldWidenMemory(Instruction *I, ElementCount VF) {
+    if (VF.isScalar())
+      return false;
+    LoopVectorizationCostModel::InstWidening Decision =
+        getWideningDecision(I, VF);
+    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+           "CM decision should be taken at this point.");
+    if (Decision == LoopVectorizationCostModel::CM_Interleave)
+      return true;
+    if (isScalarAfterVectorization(I, VF) || isProfitableToScalarize(I, VF))
+      return false;
+    return Decision != LoopVectorizationCostModel::CM_Scalarize;
+  }
+
+  /// Returns true if instruction \p I should be scalarized for \p VF.
+  bool shouldScalarize(Instruction *I, ElementCount VF) {
+    return isScalarAfterVectorization(I, VF) ||
+           isProfitableToScalarize(I, VF) || isScalarWithPredication(I, VF);
+  }
+
 private:
   unsigned NumPredStores = 0;
 
@@ -8326,18 +8369,7 @@
          "Must be called with either a load or store");
 
   auto willWiden = [&](ElementCount VF) -> bool {
-    if (VF.isScalar())
-      return false;
-    LoopVectorizationCostModel::InstWidening Decision =
-        CM.getWideningDecision(I, VF);
-    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
-           "CM decision should be taken at this point.");
-    if (Decision == LoopVectorizationCostModel::CM_Interleave)
-      return true;
-    if (CM.isScalarAfterVectorization(I, VF) ||
-        CM.isProfitableToScalarize(I, VF))
-      return false;
-    return Decision != LoopVectorizationCostModel::CM_Scalarize;
+    return CM.shouldWidenMemory(I, VF);
   };
 
   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
@@ -8443,26 +8475,8 @@
   if (IsPredicated)
     return nullptr;
 
-  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
-             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
-             ID == Intrinsic::pseudoprobe ||
-             ID == Intrinsic::experimental_noalias_scope_decl))
-    return nullptr;
-
   auto willWiden = [&](ElementCount VF) -> bool {
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-    // The following case may be scalarized depending on the VF.
-    // The flag shows whether we use Intrinsic or a usual Call for vectorized
-    // version of the instruction.
-    // Is it beneficial to perform intrinsic call compared to lib call?
-    bool NeedToScalarize = false;
-    InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
-    InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
-    bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
-    assert(IntrinsicCost.isValid() && CallCost.isValid() &&
-           "Cannot have invalid costs while widening");
-    return UseVectorIntrinsic || !NeedToScalarize;
+    return CM.shouldWidenCall(CI, VF);
   };
 
   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
@@ -8476,12 +8490,10 @@
          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
   // Instruction should be widened, unless it is scalar after vectorization,
   // scalarization is profitable or it is predicated.
-  auto WillScalarize = [this, I](ElementCount VF) -> bool {
-    return CM.isScalarAfterVectorization(I, VF) ||
-           CM.isProfitableToScalarize(I, VF) ||
-           CM.isScalarWithPredication(I, VF);
+  auto ShouldScalarize = [this, I](ElementCount VF) -> bool {
+    return CM.shouldScalarize(I, VF);
   };
-  return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
+  return !LoopVectorizationPlanner::getDecisionAndClampRange(ShouldScalarize,
                                                              Range);
 }
 
@@ -8663,6 +8675,103 @@
   return toVPRecipeResult(tryToWiden(Instr, *Plan));
 }
 
+SmallVector<VFRange> LoopVectorizationPlanner::computeRanges(
+    ElementCount MinVF, ElementCount MaxVF,
+    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+  LoopBlocksDFS DFS(OrigLoop);
+  DFS.perform(LI);
+  SmallVector<VFRange> Ranges;
+  Ranges.emplace_back(MinVF, MaxVF.getWithIncrement(1));
+
+  auto SplitRange = [&Ranges](unsigned Idx, ElementCount VF) {
+    auto &Range = Ranges[Idx];
+    VFRange NewR(VF * 2, Range.End);
+    Range.End = VF * 2;
+    if (Idx + 1 == Ranges.size())
+      Ranges.push_back(NewR);
+    else
+      Ranges.insert(&Ranges[Idx + 1], NewR);
+  };
+
+  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
+    auto applyIG = [IG, this](ElementCount VF) -> bool {
+      return (VF.isVector() && // Query is illegal for VF == 1
+              CM.getWideningDecision(IG->getInsertPos(), VF) ==
+                  LoopVectorizationCostModel::CM_Interleave);
+    };
+    for (unsigned Idx = 0; Idx != Ranges.size(); ++Idx) {
+      VFRange &Range = Ranges[Idx];
+      for (ElementCount VF = Range.Start;
+           ElementCount::isKnownLT(VF * 2, Range.End); VF *= 2) {
+        if (applyIG(VF) != applyIG(VF * 2)) {
+          SplitRange(Idx, VF);
+          break;
+        }
+      }
+    }
+  }
+
+  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+    for (Instruction &I : BB->instructionsWithoutDebug()) {
+      Instruction *Instr = &I;
+
+      // Skip irrelevant instructions.
+      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
+        continue;
+
+      // Inductions are always widened.
+      if (auto *Phi = dyn_cast<PHINode>(Instr)) {
+        InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
+        if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+            II.getKind() == InductionDescriptor::IK_FpInduction) {
+          continue;
+        }
+      }
+
+      // Check if the same decision applies for each VF for each of the current
+      // ranges. If the same decision cannot be applied to all VFs in a range,
+      // split the range at the point where the decisions diverge.
+      for (unsigned Idx = 0; Idx != Ranges.size(); ++Idx) {
+        VFRange &Range = Ranges[Idx];
+        for (ElementCount VF = Range.Start;
+             ElementCount::isKnownLT(VF * 2, Range.End); VF *= 2) {
+          bool Split = false;
+          bool Widened = false;
+          if (CM.isOptimizableIVTruncate(&I, VF)) {
+            Split |= CM.isOptimizableIVTruncate(&I, VF) !=
+                     CM.isOptimizableIVTruncate(&I, VF * 2);
+            Widened = true;
+          } else if (auto *CI = dyn_cast<CallInst>(Instr)) {
+            Split |=
+                CM.shouldWidenCall(CI, VF) != CM.shouldWidenCall(CI, VF * 2);
+            Widened |= CM.shouldWidenCall(CI, VF);
+          } else if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
+            Split |= CM.shouldWidenMemory(&I, VF) !=
+                     CM.shouldWidenMemory(&I, VF * 2);
+            Widened |= CM.shouldWidenMemory(&I, VF);
+          }
+          // If we did not already decide to widen, check if the scalarization
+          // decisions agree.
+          if (!Widened) {
+            Split |=
+                CM.shouldScalarize(&I, VF) != CM.shouldScalarize(&I, VF * 2);
+            Split |= CM.isUniformAfterVectorization(&I, VF) !=
+                     CM.isUniformAfterVectorization(&I, VF * 2);
+          }
+
+          // Split the current range into (Range.Start, VF * 2), (VF * 2,
+          // Range.End). Continue processing the second (newly added) range.
+          if (Split) {
+            SplitRange(Idx, VF);
+            break;
+          }
+        }
+      }
+    }
+  }
+  return Ranges;
+}
+
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                         ElementCount MaxVF) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8688,13 +8797,21 @@
   for (Instruction *I : DeadInstructions)
     SinkAfter.erase(I);
 
+  auto Ranges = computeRanges(MinVF, MaxVF, DeadInstructions);
   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
+  unsigned I = 0;
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
     VFRange SubRange = {VF, MaxVFPlusOne};
     VPlans.push_back(
         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
     VF = SubRange.End;
+    assert(Ranges[I].Start == SubRange.Start && Ranges[I].End == SubRange.End &&
+           "clamped range should match computed range");
+    I++;
   }
+
+  assert(Ranges.size() == VPlans.size() &&
+         "should have the same number of clamped and computed ranges");
 }
 
 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
@@ -8703,7 +8820,7 @@
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
 
-  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
+  VPRecipeBuilder RecipeBuilder(OrigLoop, Legal, CM, PSE, Builder);
 
   // ---------------------------------------------------------------------------
   // Pre-construction: record ingredients whose recipes we'll need to further
@@ -9383,7 +9500,7 @@
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
+  LoopVectorizationPlanner LVP(L, LI, TTI, LVL, CM, IAI, PSE);
 
   // Get user vectorization factor.
   ElementCount UserVF = Hints.getWidth();
@@ -9596,7 +9713,7 @@
   CM.collectValuesToIgnore();
 
   // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
+  LoopVectorizationPlanner LVP(L, LI, TTI, &LVL, CM, IAI, PSE);
 
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -28,9 +28,6 @@
   /// The loop that we evaluate.
   Loop *OrigLoop;
 
-  /// Target Library Info.
-  const TargetLibraryInfo *TLI;
-
   /// The legality analysis.
   LoopVectorizationLegality *Legal;
 
@@ -99,12 +96,10 @@
   VPRecipeOrVPValueTy toVPRecipeResult(VPRecipeBase *R) const { return R; }
 
 public:
-  VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
-                  LoopVectorizationLegality *Legal,
+  VPRecipeBuilder(Loop *OrigLoop, LoopVectorizationLegality *Legal,
                   LoopVectorizationCostModel &CM,
                   PredicatedScalarEvolution &PSE, VPBuilder &Builder)
-      : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), PSE(PSE),
-        Builder(Builder) {}
+      : OrigLoop(OrigLoop), Legal(Legal), CM(CM), PSE(PSE), Builder(Builder) {}
 
   /// Check if an existing VPValue can be used for \p Instr or a recipe can be
   /// create for \p I withing the given VF \p Range. If an existing VPValue can
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -70,7 +70,7 @@
 /// [1, 9) = {1, 2, 4, 8}
 struct VFRange {
   // A power of 2.
-  const ElementCount Start;
+  ElementCount Start;
 
   // Need not be a power of 2. If End <= Start range is empty.
   ElementCount End;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/phis-only.ll b/llvm/test/Transforms/LoopVectorize/X86/phis-only.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/phis-only.ll
@@ -0,0 +1,22 @@
+; RUN: opt -loop-vectorize -S %s | FileCheck %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; CHECK: vector.body:
+define void @test() {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %iv.2 = phi i64 [ %iv.2.next, %loop ], [ 0, %entry ]
+  %0 = trunc i64 %iv to i32
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.2.next = add nsw i64 %iv.2, -1
+  %ec = icmp eq i64 %iv.2.next, 0
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}