diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7994,6 +7994,13 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); + // For the loop header return the cached block mask. + if (OrigLoop->getHeader() == BB) { + auto Iter = BlockMaskCache.find(BB); + assert(Iter != BlockMaskCache.end() && "header mask must be set"); + return Iter->second; + } + // Look for cached value. BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); if (BCEntryIt != BlockMaskCache.end()) @@ -8002,42 +8009,6 @@ // All-one mask is modelled as no-mask following the convention for masked // load/store/gather/scatter. Initialize BlockMask to no-mask. VPValue *BlockMask = nullptr; - - if (OrigLoop->getHeader() == BB) { - if (!CM.blockNeedsPredicationForAnyReason(BB)) - return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - - assert(CM.foldTailByMasking() && "must fold the tail"); - - // If we're using the active lane mask for control flow, then we get the - // mask from the active lane mask PHI that is cached in the VPlan. - TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); - if (useActiveLaneMaskForControlFlow(TFStyle)) - return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi(); - - // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by - // constructing the desired canonical IV in the header block as its first - // non-phi instructions. - - VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); - HeaderVPBB->insert(IV, NewInsertionPoint); - - VPBuilder::InsertPointGuard Guard(Builder); - Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (useActiveLaneMask(TFStyle)) { - VPValue *TC = Plan.getTripCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, - nullptr, "active.lane.mask"); - } else { - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); - } - return BlockMaskCache[BB] = BlockMask; - } - // This is the block mask. We OR all incoming edges. for (auto *Predecessor : predecessors(BB)) { VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); @@ -8055,6 +8026,47 @@ return BlockMaskCache[BB] = BlockMask; } +void VPRecipeBuilder::createHeaderMask(VPlan &Plan) { + BasicBlock *Header = OrigLoop->getHeader(); + + // When not folding the tail, use nullptr to model all-true mask. + if (!CM.foldTailByMasking()) { + BlockMaskCache[Header] = nullptr; + return; + } + + // If we're using the active lane mask for control flow, then we get the + // mask from the active lane mask PHI that is cached in the VPlan. + TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); + if (useActiveLaneMaskForControlFlow(TFStyle)) { + BlockMaskCache[Header] = Plan.getActiveLaneMaskPhi(); + return; + } + + // Introduce the early-exit compare IV <= BTC to form header block mask. + // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by + // constructing the desired canonical IV in the header block as its first + // non-phi instructions. + + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); + auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); + HeaderVPBB->insert(IV, NewInsertionPoint); + + VPBuilder::InsertPointGuard Guard(Builder); + Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); + VPValue *BlockMask = nullptr; + if (useActiveLaneMask(TFStyle)) { + VPValue *TC = Plan.getTripCount(); + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, + nullptr, "active.lane.mask"); + } else { + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); + } + BlockMaskCache[Header] = BlockMask; +} + VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, VFRange &Range, @@ -8749,6 +8761,9 @@ DLInst ? DLInst->getDebugLoc() : DebugLoc(), CM.getTailFoldingStyle(IVUpdateMayOverflow)); + // Proactively create header. Masks for other blocks are created on demand. + RecipeBuilder.createHeaderMask(*Plan); + // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); @@ -8805,13 +8820,17 @@ } RecipeBuilder.setRecipe(Instr, Recipe); - if (isa(Recipe)) { - // VPWidenIntOrFpInductionRecipes must be kept in the phi section of - // HeaderVPBB. VPWidenIntOrFpInductionRecipes for optimized truncates - // may be generated after non-phi recipes and need to be moved to the - // phi section of HeaderVPBB. + if (isa(Recipe)) { + // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In + // the following cases, VPHeaderPHIRecipes may be created after non-phi + // recipes and need to be moved to the phi section of HeaderVPBB: + // * tail-folding (non-phi recipes computing the header mask are + // introduced earlier than regular header phi recipes, and should appear + // after them) + // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. + assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || - isa(Instr)) && + CM.foldTailByMasking() || isa(Instr)) && "unexpected recipe needs moving"); Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); } else diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -134,10 +134,13 @@ } /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True. It returns the *entry* - /// mask for the block BB. + /// that the header block of the loop is set to True or the loop mask when + /// tail folding. It returns the *entry* mask for the block BB. VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan); + /// Create the mask for the vector loop header block. + void createHeaderMask(VPlan &Plan); + /// A helper function that computes the predicate of the edge between SRC /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlan &Plan);