diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8148,6 +8148,10 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); + // For the loop header return the cached block mask. + if (OrigLoop->getHeader() == BB) + return BlockMaskCache[BB]; + // Look for cached value. BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); if (BCEntryIt != BlockMaskCache.end()) @@ -8156,42 +8160,6 @@ // All-one mask is modelled as no-mask following the convention for masked // load/store/gather/scatter. Initialize BlockMask to no-mask. VPValue *BlockMask = nullptr; - - if (OrigLoop->getHeader() == BB) { - if (!CM.blockNeedsPredicationForAnyReason(BB)) - return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - - assert(CM.foldTailByMasking() && "must fold the tail"); - - // If we're using the active lane mask for control flow, then we get the - // mask from the active lane mask PHI that is cached in the VPlan. - TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); - if (useActiveLaneMaskForControlFlow(TFStyle)) - return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi(); - - // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by - // constructing the desired canonical IV in the header block as its first - // non-phi instructions. 
- - VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); - HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); - - VPBuilder::InsertPointGuard Guard(Builder); - Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (useActiveLaneMask(TFStyle)) { - VPValue *TC = Plan.getTripCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, - nullptr, "active.lane.mask"); - } else { - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); - } - return BlockMaskCache[BB] = BlockMask; - } - // This is the block mask. We OR all incoming edges. for (auto *Predecessor : predecessors(BB)) { VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); @@ -8209,6 +8177,47 @@ return BlockMaskCache[BB] = BlockMask; } +/// Create the header-block mask used when folding the tail by masking and +/// record it in BlockMaskCache for the header of the original loop. Callers +/// must only invoke this when the cost model folds the tail by masking. +void VPRecipeBuilder::createTailFoldHeaderMask(VPlan &Plan) { + assert(CM.foldTailByMasking() && "must fold the tail"); + BasicBlock *Header = OrigLoop->getHeader(); + + // If we're using the active lane mask for control flow, then we get the + // mask from the active lane mask PHI that is cached in the VPlan. + TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); + if (useActiveLaneMaskForControlFlow(TFStyle)) { + BlockMaskCache[Header] = Plan.getActiveLaneMaskPhi(); + // The PHI is the header mask. Falling through would insert an unused + // widened canonical IV and overwrite the entry cached above. + return; + } + + // Introduce the early-exit compare IV <= BTC to form header block mask. + // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by + // constructing the desired canonical IV in the header block as its first + // non-phi instructions. 
+ + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); + auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); + HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); + + VPBuilder::InsertPointGuard Guard(Builder); + Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); + VPValue *BlockMask = nullptr; + if (useActiveLaneMask(TFStyle)) { + VPValue *TC = Plan.getTripCount(); + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, + nullptr, "active.lane.mask"); + } else { + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); + } + BlockMaskCache[Header] = BlockMask; +} + VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, VFRange &Range, @@ -8907,6 +8916,11 @@ DLInst ? DLInst->getDebugLoc() : DebugLoc(), CM.getTailFoldingStyle(IVUpdateMayOverflow)); + // Proactively create header mask when tail-folding. Masks for other blocks + // are created on demand. + if (CM.foldTailByMasking()) + RecipeBuilder.createTailFoldHeaderMask(*Plan); + // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); @@ -8963,11 +8977,14 @@ } RecipeBuilder.setRecipe(Instr, Recipe); - if (isa(Recipe) && + if (isa(Recipe) && HeaderVPBB->getFirstNonPhi() != VPBB->end()) { - // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the - // phi section of HeaderVPBB. - assert(isa(Instr)); + // Move VPWidenIntOrFpInductionRecipes to the PHI section of the header + // block. This is needed either when + // * tail-folding (mask recipes will be created before the regular + // induction recipes), + // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. 
+ assert(CM.foldTailByMasking() || isa(Instr)); Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); } else VPBB->appendRecipe(Recipe); diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -134,10 +134,13 @@ } /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True. It returns the *entry* - /// mask for the block BB. + /// that the header block of the loop is set to True or the loop mask when + /// tail folding. It returns the *entry* mask for the block BB. VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan); + /// Create the mask for the vector loop header block when tail-folding. + void createTailFoldHeaderMask(VPlan &Plan); + /// A helper function that computes the predicate of the edge between SRC /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlan &Plan);