Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -296,7 +296,18 @@ VectorizationFactor planInVPlanNativePath(ElementCount UserVF); /// Finalize the best decision and dispose of all other VPlans. - void setBestPlan(ElementCount VF, unsigned UF); + void setBestPlan(ElementCount VF, unsigned UF, + SmallVectorImpl<VPlanPtr> *BackupPlans = nullptr); + + /// Add in extra plans, for example any that may have been saved when calling + /// setBestPlan above. + void addVPlans(SmallVectorImpl<VPlanPtr> &ExtraVPlans) { + for (auto &I : ExtraVPlans) + VPlans.push_back(std::unique_ptr<VPlan>(I.release())); + + // Since we've released the pointer we should clear the entries too. + ExtraVPlans.clear(); + } /// Generate the IR code for the body of the vectorized loop according to the /// best selected VPlan. Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6229,15 +6229,6 @@ return Result; } - // FIXME: This can be fixed for scalable vectors later, because at this stage - // the LoopVectorizer will only consider vectorizing a loop with scalable - // vectors when the loop has a hint to enable vectorization for a given VF. - if (MainLoopVF.isScalable()) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " - "yet supported.\n"); - return Result; - } - // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. 
if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { @@ -6249,9 +6240,10 @@ if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); - if (LVP.hasPlanWithVFs( - {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) - return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; + ElementCount ForcedEC = + ElementCount::getFixed(EpilogueVectorizationForceVF); + if (LVP.hasPlanWithVFs({ForcedEC})) + return {ForcedEC, 0}; else { LLVM_DEBUG( dbgs() @@ -6268,14 +6260,27 @@ return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) + ElementCount FixedMainLoopVF = MainLoopVF; + if (FixedMainLoopVF.isScalable()) { + FixedMainLoopVF = + ElementCount::getFixed(FixedMainLoopVF.getKnownMinValue()); + LLVM_DEBUG( + dbgs() << "LEV: Epilogue vectorization using scalable vectors not " + "yet supported. Converting to fixed-width (VF=" + << FixedMainLoopVF << ") instead\n"); + } + + if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " + "this loop\n"); return Result; + } for (auto &NextVF : ProfitableVFs) - if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && + if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && (Result.Width.getFixedValue() == 1 || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) + LVP.hasPlanWithVFs({NextVF.Width})) Result = NextVF; if (Result != VectorizationFactor::Disabled()) @@ -8182,15 +8187,24 @@ return SelectedVF; } -void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { +void LoopVectorizationPlanner::setBestPlan( + ElementCount VF, unsigned UF, SmallVectorImpl<VPlanPtr> *BackupPlans) { LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); BestVF = VF; BestUF = UF; + if (BackupPlans) { + for (auto &I : VPlans) { + if (!I.get()->hasVF(VF)) + 
BackupPlans->push_back(std::unique_ptr<VPlan>(I.release())); + } + } + erase_if(VPlans, [VF](const VPlanPtr &Plan) { - return !Plan->hasVF(VF); + return Plan.get() == nullptr || !Plan->hasVF(VF); }); + assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); } @@ -8388,7 +8402,9 @@ OldInduction = Legal->getPrimaryInduction(); Type *IdxTy = Legal->getWidestInductionType(); Value *StartIdx = ConstantInt::get(IdxTy, 0); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + + IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); + Value *Step = getRuntimeVF(B, IdxTy, VF * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); EPI.VectorTripCount = CountRoundDown; Induction = @@ -10380,7 +10396,6 @@ F->getParent()->getDataLayout()); if (!VF.Width.isScalar() || IC > 1) Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); - LVP.setBestPlan(VF.Width, IC); using namespace ore; if (!VectorizeLoop) { @@ -10389,6 +10404,7 @@ // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, BFI, PSI, Checks); + LVP.setBestPlan(VF.Width, IC); LVP.executePlan(Unroller, DT); ORE->emit([&]() { @@ -10411,8 +10427,8 @@ EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - - LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); + SmallVector<VPlanPtr, 4> ExtraVPlans; + LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF, &ExtraVPlans); LVP.executePlan(MainILV, DT); ++LoopsVectorized; @@ -10421,6 +10437,7 @@ // Second pass vectorizes the epilogue and adjusts the control flow // edges from the first pass. 
+ LVP.addVPlans(ExtraVPlans); LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); EPI.MainLoopVF = EPI.EpilogueVF; EPI.MainLoopUF = EPI.EpilogueUF; @@ -10435,6 +10452,7 @@ } else { InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM, BFI, PSI, Checks); + LVP.setBestPlan(VF.Width, IC); LVP.executePlan(LB, DT); ++LoopsVectorized; Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -0,0 +1,59 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=0 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=8 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-force-VF=8 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG-FORCED + +target triple = "aarch64-linux-gnu" + +; DEBUG: LV: Checking a loop in "f1" +; DEBUG: LEV: Epilogue vectorization using scalable vectors not yet supported. Converting to fixed-width (VF=16) instead +; DEBUG: Create Skeleton for epilogue vectorized loop (first pass) +; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 + +; DEBUG-FORCED: LV: Checking a loop in "f1" +; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced. 
+; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass) +; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 + +define void @f1(i8* %A) #0 { +; CHECK-LABEL: @f1( +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[NUM_ITS:%.*]] = mul i64 [[VSCALE]], 32 +; CHECK-NEXT: [[MIN_IT_CHECK:%.*]] = icmp ult i64 1024, [[NUM_ITS]] +; CHECK-NEXT: br i1 [[MIN_IT_CHECK]], label %vec.epilog.ph, label %vector.ph +; CHECK: vector.ph: +; CHECK-NEXT: [[VSCALE1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[VEC_ITS1:%.*]] = mul i64 [[VSCALE1]], {{.*}} +; CHECK-NEXT: [[VSCALE2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[VEC_ITS2:%.*]] = mul i64 [[VSCALE2]], {{.*}} +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[VEC_ITS2]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK: vector.body: +; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NXT:%.*]], %vector.body ] +; CHECK: store +; CHECK: store +; CHECK: [[IDX_NXT]] = add nuw i64 [[IDX]], [[VEC_ITS1]] +; CHECK-NEXT: {{%.*}} = icmp eq i64 [[IDX_NXT]], [[N_VEC]] +; CHECK: vec.epilog.vector.body: +; CHECK: store <8 x i8> +; CHECK: for.body: +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv + store i8 1, i8* %arrayidx, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp ne i64 %iv.next, 1024 + br i1 %exitcond, label %for.body, label %exit + +exit: + ret void +} + +attributes #0 = { "target-features"="+sve" } Index: llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll +++ llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll @@ -1,11 +1,12 @@ ; REQUIRES: asserts -; 
RUN: opt < %s -passes='loop-vectorize' -force-vector-width=2 -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S -scalable-vectorization=on 2>&1 | FileCheck %s +; RUN: opt < %s -passes='loop-vectorize' -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S -scalable-vectorization=on 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" ; Currently we cannot handle scalable vectorization factors. ; CHECK: LV: Checking a loop in "f1" -; CHECK: LEV: Epilogue vectorization for scalable vectors not yet supported. +; CHECK: LEV: Epilogue vectorization factor is forced. +; CHECK: Epilogue Loop VF:2, Epilogue Loop UF:1 define void @f1(i8* %A) { entry: