Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6229,15 +6229,6 @@ return Result; } - // FIXME: This can be fixed for scalable vectors later, because at this stage - // the LoopVectorizer will only consider vectorizing a loop with scalable - // vectors when the loop has a hint to enable vectorization for a given VF. - if (MainLoopVF.isScalable()) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " - "yet supported.\n"); - return Result; - } - // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { @@ -6250,7 +6241,7 @@ if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVFs({MainLoopVF, ForcedEC})) + if (LVP.hasPlanWithVFs({ForcedEC})) return {ForcedEC, 0}; else { LLVM_DEBUG( @@ -6268,14 +6259,24 @@ return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) + auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); + if (MainLoopVF.isScalable()) + LLVM_DEBUG( + dbgs() << "LEV: Epilogue vectorization using scalable vectors not " + "yet supported. Converting to fixed-width (VF=" + << FixedMainLoopVF << ") instead\n"); + + if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " + "this loop\n"); return Result; + } for (auto &NextVF : ProfitableVFs) - if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && + if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && (Result.Width.getFixedValue() == 1 || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) + LVP.hasPlanWithVFs({NextVF.Width})) Result = NextVF; if (Result != VectorizationFactor::Disabled()) @@ -8393,7 +8394,9 @@ OldInduction = Legal->getPrimaryInduction(); Type *IdxTy = Legal->getWidestInductionType(); Value *StartIdx = ConstantInt::get(IdxTy, 0); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + + IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); + Value *Step = getRuntimeVF(B, IdxTy, VF * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); EPI.VectorTripCount = CountRoundDown; Induction = @@ -10385,7 +10388,6 @@ F->getParent()->getDataLayout()); if (!VF.Width.isScalar() || IC > 1) Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); - VPlanPtr &BestPlan = LVP.getBestPlanFor(VF.Width, IC); using namespace ore; if (!VectorizeLoop) { @@ -10394,6 +10396,7 @@ // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, BFI, PSI, Checks); + VPlanPtr &BestPlan = LVP.getBestPlanFor(VF.Width, IC); LVP.executePlan(BestPlan, Unroller, DT); ORE->emit([&]() { @@ -10440,6 +10443,7 @@ } else { InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM, BFI, PSI, Checks); + VPlanPtr &BestPlan = LVP.getBestPlanFor(VF.Width, IC); LVP.executePlan(BestPlan, LB, DT); ++LoopsVectorized; Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -0,0 +1,59 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=0 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=8 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-force-VF=8 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG-FORCED + +target triple = "aarch64-linux-gnu" + +; DEBUG: LV: Checking a loop in "f1" +; DEBUG: LEV: Epilogue vectorization using scalable vectors not yet supported. Converting to fixed-width (VF=16) instead +; DEBUG: Create Skeleton for epilogue vectorized loop (first pass) +; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 + +; DEBUG-FORCED: LV: Checking a loop in "f1" +; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced. +; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass) +; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 + +define void @f1(i8* %A) #0 { +; CHECK-LABEL: @f1( +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[NUM_ITS:%.*]] = mul i64 [[VSCALE]], 32 +; CHECK-NEXT: [[MIN_IT_CHECK:%.*]] = icmp ult i64 1024, [[NUM_ITS]] +; CHECK-NEXT: br i1 [[MIN_IT_CHECK]], label %vec.epilog.ph, label %vector.ph +; CHECK: vector.ph: +; CHECK-NEXT: [[VSCALE1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[VEC_ITS1:%.*]] = mul i64 [[VSCALE1]], {{.*}} +; CHECK-NEXT: [[VSCALE2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[VEC_ITS2:%.*]] = mul i64 [[VSCALE2]], {{.*}} +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[VEC_ITS2]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK: vector.body: +; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NXT:%.*]], %vector.body ] +; CHECK: store +; CHECK: store +; CHECK: [[IDX_NXT]] = add nuw i64 [[IDX]], [[VEC_ITS1]] +; CHECK-NEXT: {{%.*}} = icmp eq i64 [[IDX_NXT]], [[N_VEC]] +; CHECK: vec.epilog.vector.body: +; CHECK: store <8 x i8> +; CHECK: for.body: +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv + store i8 1, i8* %arrayidx, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp ne i64 %iv.next, 1024 + br i1 %exitcond, label %for.body, label %exit + +exit: + ret void +} + +attributes #0 = { "target-features"="+sve" } Index: llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll +++ llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll @@ -1,11 +1,12 @@ ; REQUIRES: asserts -; RUN: opt < %s -passes='loop-vectorize' -force-vector-width=2 -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S -scalable-vectorization=on 2>&1 | FileCheck %s +; RUN: opt < %s -passes='loop-vectorize' -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S -scalable-vectorization=on 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" ; Currently we cannot handle scalable vectorization factors. ; CHECK: LV: Checking a loop in "f1" -; CHECK: LEV: Epilogue vectorization for scalable vectors not yet supported. +; CHECK: LEV: Epilogue vectorization factor is forced. +; CHECK: Epilogue Loop VF:2, Epilogue Loop UF:1 define void @f1(i8* %A) { entry: