Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -303,12 +303,9 @@ /// Look through the existing plans and return true if we have one with all /// the vectorization factors in question. - bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const { - return any_of(VPlans, [&](const VPlanPtr &Plan) { - return all_of(VFs, [&](const ElementCount &VF) { - return Plan->hasVF(VF); - }); - }); + bool hasPlanWithVF(ElementCount VF) const { + return any_of(VPlans, + [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); }); } /// Remove all VPlans except those containing one of the \p VFs. Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6229,15 +6229,6 @@ return Result; } - // FIXME: This can be fixed for scalable vectors later, because at this stage - // the LoopVectorizer will only consider vectorizing a loop with scalable - // vectors when the loop has a hint to enable vectorization for a given VF. - if (MainLoopVF.isScalable()) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " - "yet supported.\n"); - return Result; - } - // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. 
if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { @@ -6250,7 +6241,7 @@ if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVFs({MainLoopVF, ForcedEC})) + if (LVP.hasPlanWithVF(ForcedEC)) return {ForcedEC, 0}; else { LLVM_DEBUG( @@ -6268,14 +6259,24 @@ return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) + auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); + if (MainLoopVF.isScalable()) + LLVM_DEBUG( + dbgs() << "LEV: Epilogue vectorization using scalable vectors not " + "yet supported. Converting to fixed-width (VF=" + << FixedMainLoopVF << ") instead\n"); + + if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " + "this loop\n"); return Result; + } for (auto &NextVF : ProfitableVFs) - if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && + if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && (Result.Width.getFixedValue() == 1 || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) + LVP.hasPlanWithVF(NextVF.Width)) Result = NextVF; if (Result != VectorizationFactor::Disabled()) @@ -8391,7 +8392,9 @@ OldInduction = Legal->getPrimaryInduction(); Type *IdxTy = Legal->getWidestInductionType(); Value *StartIdx = ConstantInt::get(IdxTy, 0); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + + IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); + Value *Step = getRuntimeVF(B, IdxTy, VF * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); EPI.VectorTripCount = CountRoundDown; Induction = @@ -10384,8 +10387,6 @@ F->getParent()->getDataLayout()); if (!VF.Width.isScalar() || IC > 1) Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); - LVP.removePlansExcept({VF.Width}); - const 
VPlanPtr &BestPlan = LVP.getBestPlanFor(VF.Width); using namespace ore; if (!VectorizeLoop) { @@ -10394,6 +10395,10 @@ // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, BFI, PSI, Checks); + + LVP.removePlansExcept({VF.Width}); + + const VPlanPtr &BestPlan = LVP.getBestPlanFor(VF.Width); LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); ORE->emit([&]() { @@ -10417,7 +10422,11 @@ EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestPlan, MainILV, + LVP.removePlansExcept({EPI.MainLoopVF, EPI.EpilogueVF}); + const VPlanPtr &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); + const VPlanPtr &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); + + LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT); ++LoopsVectorized; @@ -10431,7 +10440,7 @@ EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestPlan, EpilogILV, + LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT); ++LoopsEpilogueVectorized; @@ -10440,6 +10449,10 @@ } else { InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM, BFI, PSI, Checks); + + LVP.removePlansExcept({VF.Width}); + + const VPlanPtr &BestPlan = LVP.getBestPlanFor(VF.Width); LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); ++LoopsVectorized; Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -0,0 +1,59 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=0 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s 
--check-prefix=DEBUG +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=8 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-force-VF=8 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG-FORCED + +target triple = "aarch64-linux-gnu" + +; DEBUG: LV: Checking a loop in "f1" +; DEBUG: LEV: Epilogue vectorization using scalable vectors not yet supported. Converting to fixed-width (VF=16) instead +; DEBUG: Create Skeleton for epilogue vectorized loop (first pass) +; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 + +; DEBUG-FORCED: LV: Checking a loop in "f1" +; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced. +; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass) +; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 + +define void @f1(i8* %A) #0 { +; CHECK-LABEL: @f1( +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[NUM_ITS:%.*]] = mul i64 [[VSCALE]], 32 +; CHECK-NEXT: [[MIN_IT_CHECK:%.*]] = icmp ult i64 1024, [[NUM_ITS]] +; CHECK-NEXT: br i1 [[MIN_IT_CHECK]], label %vec.epilog.ph, label %vector.ph +; CHECK: vector.ph: +; CHECK-NEXT: [[VSCALE1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[VEC_ITS1:%.*]] = mul i64 [[VSCALE1]], {{.*}} +; CHECK-NEXT: [[VSCALE2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[VEC_ITS2:%.*]] = mul i64 [[VSCALE2]], {{.*}} +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[VEC_ITS2]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK: vector.body: +; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NXT:%.*]], %vector.body ] +; CHECK: 
store +; CHECK: store +; CHECK: [[IDX_NXT]] = add nuw i64 [[IDX]], [[VEC_ITS1]] +; CHECK-NEXT: {{%.*}} = icmp eq i64 [[IDX_NXT]], [[N_VEC]] +; CHECK: vec.epilog.vector.body: +; CHECK: store <8 x i8> +; CHECK: for.body: +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv + store i8 1, i8* %arrayidx, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp ne i64 %iv.next, 1024 + br i1 %exitcond, label %for.body, label %exit + +exit: + ret void +} + +attributes #0 = { "target-features"="+sve" } Index: llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll +++ llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll @@ -1,11 +1,12 @@ ; REQUIRES: asserts -; RUN: opt < %s -passes='loop-vectorize' -force-vector-width=2 -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S -scalable-vectorization=on 2>&1 | FileCheck %s +; RUN: opt < %s -passes='loop-vectorize' -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S -scalable-vectorization=on 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" ; Currently we cannot handle scalable vectorization factors. ; CHECK: LV: Checking a loop in "f1" -; CHECK: LEV: Epilogue vectorization for scalable vectors not yet supported. +; CHECK: LEV: Epilogue vectorization factor is forced. +; CHECK: Epilogue Loop VF:2, Epilogue Loop UF:1 define void @f1(i8* %A) { entry: