diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7638,6 +7638,9 @@
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
                     << '\n');
 
+  if (!IsEpilogueVectorization)
+    VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+
   // Perform the actual loop transformation.
 
   // 1. Set up the skeleton for vectorization, including vector pre-header and
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2591,6 +2591,12 @@
 
   void addVF(ElementCount VF) { VFs.insert(VF); }
 
+  /// Drop all recorded VFs and make \p VF the only VF of the plan.
+  void setVF(ElementCount VF) {
+    VFs.clear();
+    VFs.insert(VF);
+  }
+
   bool hasVF(ElementCount VF) { return VFs.count(VF); }
 
   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -585,45 +585,11 @@
   return nullptr;
 }
 
-static bool canSimplifyBranchOnCond(VPInstruction *Term) {
-  VPInstruction *Not = dyn_cast<VPInstruction>(Term->getOperand(0));
-  if (!Not || Not->getOpcode() != VPInstruction::Not)
-    return false;
-
-  VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0));
-  return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask;
-}
-
 void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
                              Value *CanonicalIVStartValue,
                              VPTransformState &State,
                              bool IsEpilogueVectorization) {
 
-  VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock();
-  auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
-  // Try to simplify the branch condition if TC <= VF * UF when preparing to
-  // execute the plan for the main vector loop. We only do this if the
-  // terminator is:
-  //  1. BranchOnCount, or
-  //  2. BranchOnCond where the input is Not(ActiveLaneMask).
-  if (!IsEpilogueVectorization && Term && isa<ConstantInt>(TripCountV) &&
-      (Term->getOpcode() == VPInstruction::BranchOnCount ||
-       (Term->getOpcode() == VPInstruction::BranchOnCond &&
-        canSimplifyBranchOnCond(Term)))) {
-    ConstantInt *C = cast<ConstantInt>(TripCountV);
-    uint64_t TCVal = C->getZExtValue();
-    if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) {
-      auto *BOC =
-          new VPInstruction(VPInstruction::BranchOnCond,
-                            {getOrAddExternalDef(State.Builder.getTrue())});
-      Term->eraseFromParent();
-      ExitingVPBB->appendRecipe(BOC);
-      // TODO: Further simplifications are possible
-      //      1. Replace inductions with constants.
-      //      2. Replace vector loop region with VPBasicBlock.
-    }
-  }
-
   // Check if the trip count is needed, and if so build it.
   if (TripCount && TripCount->getNumUsers()) {
     for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -23,6 +23,7 @@
 class PHINode;
 class ScalarEvolution;
 class Loop;
+class PredicatedScalarEvolution;
 class TargetLibraryInfo;
 
 struct VPlanTransforms {
@@ -62,6 +63,12 @@
   /// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing
   /// them with already existing recipes expanding the same SCEV expression.
   static void removeRedundantExpandSCEVRecipes(VPlan &Plan);
+
+  /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
+  /// resulting plan to \p BestVF and \p BestUF.
+  static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
+                                 unsigned BestUF,
+                                 PredicatedScalarEvolution &PSE);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -451,3 +451,53 @@
     ExpR->eraseFromParent();
   }
 }
+
+/// Returns true if the first operand of \p Term is Not(ActiveLaneMask).
+static bool canSimplifyBranchOnCond(VPInstruction *Term) {
+  VPInstruction *Not = dyn_cast<VPInstruction>(Term->getOperand(0));
+  if (!Not || Not->getOpcode() != VPInstruction::Not)
+    return false;
+
+  VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0));
+  return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask;
+}
+
+void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
+                                         unsigned BestUF,
+                                         PredicatedScalarEvolution &PSE) {
+  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
+  VPBasicBlock *ExitingVPBB =
+      Plan.getVectorLoopRegion()->getExitingBasicBlock();
+  auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
+  // Try to simplify the branch condition if TC <= VF * UF when preparing to
+  // execute the plan for the main vector loop. We only do this if the
+  // terminator is:
+  //  1. BranchOnCount, or
+  //  2. BranchOnCond where the input is Not(ActiveLaneMask).
+  if (!Term || (Term->getOpcode() != VPInstruction::BranchOnCount &&
+                (Term->getOpcode() != VPInstruction::BranchOnCond ||
+                 !canSimplifyBranchOnCond(Term))))
+    return;
+
+  Type *IdxTy =
+      Plan.getCanonicalIV()->getStartValue()->getLiveInIRValue()->getType();
+  const SCEV *TripCount = createTripCountSCEV(IdxTy, PSE);
+  auto *C = dyn_cast<SCEVConstant>(TripCount);
+  ScalarEvolution &SE = *PSE.getSE();
+  // Compare on the APInt against a 64-bit product: getZExtValue() asserts if
+  // the constant needs more than 64 bits, and an unsigned VF * UF may wrap.
+  if (!C || TripCount->isZero() ||
+      C->getAPInt().ugt(uint64_t(BestVF.getKnownMinValue()) * BestUF))
+    return;
+
+  LLVMContext &Ctx = SE.getContext();
+  auto *BOC =
+      new VPInstruction(VPInstruction::BranchOnCond,
+                        {Plan.getOrAddExternalDef(ConstantInt::getTrue(Ctx))});
+  Term->eraseFromParent();
+  ExitingVPBB->appendRecipe(BOC);
+  Plan.setVF(BestVF);
+  // TODO: Further simplifications are possible
+  //      1. Replace inductions with constants.
+  //      2. Replace vector loop region with VPBasicBlock.
+}
diff --git a/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll b/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll
--- a/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll
+++ b/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll
@@ -13,7 +13,8 @@
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1, 0
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_I_I_I:%.*]], label [[SCALAR_PH]]