diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8795,13 +8795,17 @@ return VPBB; } LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); - assert(VPBB->getSuccessors().empty() && - "VPBB has successors when handling predicated replication."); + + VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); + assert(SingleSucc && "VPBB must have a single successor when handling " + "predicated replication."); + VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); // Record predicated instructions for above packing optimizations. VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); VPBlockUtils::insertBlockAfter(Region, VPBB); auto *RegSucc = new VPBasicBlock(); VPBlockUtils::insertBlockAfter(RegSucc, Region); + VPBlockUtils::connectBlocks(RegSucc, SingleSucc); return RegSucc; } @@ -9022,30 +9026,25 @@ // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - auto Plan = std::make_unique(); + // Create initial VPlan skeleton, with separate header and latch blocks. + VPBasicBlock *HeaderVPBB = new VPBasicBlock(); + VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); + VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); + auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); + auto Plan = std::make_unique(TopRegion); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); - VPBasicBlock *VPBB = nullptr; - VPBasicBlock *HeaderVPBB = nullptr; + VPBasicBlock *VPBB = HeaderVPBB; SmallVector InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; - auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); - if (VPBB) - VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); - else { - auto *TopRegion = new VPRegionBlock("vector loop"); - TopRegion->setEntry(FirstVPBBForBB); - Plan->setEntry(TopRegion); - HeaderVPBB = FirstVPBBForBB; - } - VPBB = FirstVPBBForBB; + VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. @@ -9112,8 +9111,17 @@ : ""); } } + + VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); + VPBB = cast(VPBB->getSingleSuccessor()); } + // Fold the last, empty block into its predecessor. + VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); + assert(VPBB && "expected to fold last (empty) block"); + // After here, VPBB should not be used. + VPBB = nullptr; + assert(isa(Plan->getEntry()) && !Plan->getEntry()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " @@ -9183,13 +9191,9 @@ VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); VPBlockUtils::connectBlocks(SplitPred, SinkRegion); VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); - if (VPBB == SplitPred) - VPBB = SplitBlock; } } - cast(Plan->getEntry())->setExit(VPBB); - VPlanTransforms::removeRedundantInductionCasts(*Plan); // Now that sink-after is done, move induction recipes for optimized truncates @@ -9198,7 +9202,8 @@ Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); + adjustRecipesForReductions(cast(TopRegion->getExit()), Plan, + RecipeBuilder, Range.Start); // Introduce a recipe to combine the incoming and previous values of a // first-order recurrence. @@ -9278,6 +9283,11 @@ RSO.flush(); Plan->setName(PlanName); + // Fold Exit block into its predecessor if possible. + // TODO: Fold block earlier once all VPlan transforms properly maintain a + // VPBasicBlock as exit. + VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); + assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2352,18 +2352,23 @@ /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p - /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr - /// has more than one successor, its conditional bit is propagated to \p - /// NewBlock. \p NewBlock must have neither successors nor predecessors. + /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's + /// successors are moved from \p BlockPtr to \p NewBlock and \p BlockPtr's + /// conditional bit is propagated to \p NewBlock. \p NewBlock must have + /// neither successors nor predecessors. static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { assert(NewBlock->getSuccessors().empty() && - "Can't insert new block with successors."); - // TODO: move successors from BlockPtr to NewBlock when this functionality - // is necessary. For now, setBlockSingleSuccessor will assert if BlockPtr - // already has successors. - BlockPtr->setOneSuccessor(NewBlock); - NewBlock->setPredecessors({BlockPtr}); + NewBlock->getPredecessors().empty() && + "Can't insert new block with predecessors or successors."); NewBlock->setParent(BlockPtr->getParent()); + SmallVector Succs(BlockPtr->successors()); + for (VPBlockBase *Succ : Succs) { + disconnectBlocks(BlockPtr, Succ); + connectBlocks(NewBlock, Succ); + } + NewBlock->setCondBit(BlockPtr->getCondBit()); + BlockPtr->setCondBit(nullptr); + connectBlocks(BlockPtr, NewBlock); } /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p @@ -2406,6 +2411,31 @@ To->removePredecessor(From); } + /// Try to merge \p Block into its single predecessor, if \p Block is a + /// VPBasicBlock and its predecessor has a single successor. Returns a pointer + /// to the predecessor \p Block was merged into or nullptr otherwise. + static VPBasicBlock *tryToMergeBlockIntoPredecessor(VPBlockBase *Block) { + auto *VPBB = dyn_cast(Block); + auto *PredVPBB = + dyn_cast_or_null(Block->getSinglePredecessor()); + if (!VPBB || !PredVPBB || PredVPBB->getNumSuccessors() != 1) + return nullptr; + + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) + R.moveBefore(*PredVPBB, PredVPBB->end()); + VPBlockUtils::disconnectBlocks(PredVPBB, VPBB); + auto *ParentRegion = cast(Block->getParent()); + if (ParentRegion->getExit() == Block) + ParentRegion->setExit(PredVPBB); + SmallVector Successors(Block->successors()); + for (auto *Succ : Successors) { + VPBlockUtils::disconnectBlocks(Block, Succ); + VPBlockUtils::connectBlocks(PredVPBB, Succ); + } + delete Block; + return PredVPBB; + } + /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge. static bool isBackEdge(const VPBlockBase *FromBlock, const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) { diff --git a/llvm/test/Transforms/LoopVectorize/reduction-order.ll b/llvm/test/Transforms/LoopVectorize/reduction-order.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-order.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-order.ll @@ -7,9 +7,9 @@ ; in deterministic order. ; CHECK-LABEL: @foo( ; CHECK: vector.body: -; CHECK: icmp ule <4 x i64> -; CHECK-NEXT: %[[VAR1:.*]] = add <4 x i32> , %vec.phi1 +; CHECK: %[[VAR1:.*]] = add <4 x i32> , %vec.phi1 ; CHECK-NEXT: %[[VAR2:.*]] = add <4 x i32> %vec.phi, +; CHECK-NEXT: icmp ule <4 x i64> ; CHECK-NEXT: select <4 x i1> {{.*}}, <4 x i32> %[[VAR2]], <4 x i32> ; CHECK-NEXT: select <4 x i1> {{.*}}, <4 x i32> %[[VAR1]], <4 x i32> ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction.ll b/llvm/test/Transforms/LoopVectorize/select-reduction.ll --- a/llvm/test/Transforms/LoopVectorize/select-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction.ll @@ -33,9 +33,9 @@ ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT4]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], ; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]