diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8008,6 +8008,47 @@
   return EdgeMaskCache[Edge] = EdgeMask;
 }
 
+void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
+  BasicBlock *Header = OrigLoop->getHeader();
+
+  // When not folding the tail, use nullptr to model all-true mask.
+  if (!CM.foldTailByMasking()) {
+    BlockMaskCache[Header] = nullptr;
+    return;
+  }
+
+  // If we're using the active lane mask for control flow, then we get the
+  // mask from the active lane mask PHI that is cached in the VPlan.
+  TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
+  if (useActiveLaneMaskForControlFlow(TFStyle)) {
+    BlockMaskCache[Header] = Plan.getActiveLaneMaskPhi();
+    return;
+  }
+
+  // Introduce the early-exit compare IV <= BTC to form header block mask.
+  // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+  // constructing the desired canonical IV in the header block as its first
+  // non-phi instructions.
+
+  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+  auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
+  auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
+  HeaderVPBB->insert(IV, NewInsertionPoint);
+
+  VPBuilder::InsertPointGuard Guard(Builder);
+  Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
+  VPValue *BlockMask = nullptr;
+  if (useActiveLaneMask(TFStyle)) {
+    VPValue *TC = Plan.getTripCount();
+    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
+                                     nullptr, "active.lane.mask");
+  } else {
+    VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+  }
+  BlockMaskCache[Header] = BlockMask;
+}
+
 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
 
@@ -8016,45 +8057,12 @@
   if (BCEntryIt != BlockMaskCache.end())
     return BCEntryIt->second;
 
+  assert(OrigLoop->getHeader() != BB &&
+         "Loop header must have cached block mask");
+
   // All-one mask is modelled as no-mask following the convention for masked
   // load/store/gather/scatter. Initialize BlockMask to no-mask.
   VPValue *BlockMask = nullptr;
-
-  if (OrigLoop->getHeader() == BB) {
-    if (!CM.blockNeedsPredicationForAnyReason(BB))
-      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
-
-    assert(CM.foldTailByMasking() && "must fold the tail");
-
-    // If we're using the active lane mask for control flow, then we get the
-    // mask from the active lane mask PHI that is cached in the VPlan.
-    TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
-    if (useActiveLaneMaskForControlFlow(TFStyle))
-      return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi();
-
-    // Introduce the early-exit compare IV <= BTC to form header block mask.
-    // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
-    // constructing the desired canonical IV in the header block as its first
-    // non-phi instructions.
-
-    VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
-    auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
-    auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
-    HeaderVPBB->insert(IV, NewInsertionPoint);
-
-    VPBuilder::InsertPointGuard Guard(Builder);
-    Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
-    if (useActiveLaneMask(TFStyle)) {
-      VPValue *TC = Plan.getTripCount();
-      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
-                                       nullptr, "active.lane.mask");
-    } else {
-      VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
-      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
-    }
-    return BlockMaskCache[BB] = BlockMask;
-  }
-
   // This is the block mask. We OR all incoming edges.
   for (auto *Predecessor : predecessors(BB)) {
     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
@@ -8766,6 +8774,10 @@
                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
                         CM.getTailFoldingStyle(IVUpdateMayOverflow));
 
+  // Proactively create header mask. Masks for other blocks are created on
+  // demand.
+  RecipeBuilder.createHeaderMask(*Plan);
+
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
   LoopBlocksDFS DFS(OrigLoop);
@@ -8822,13 +8834,17 @@
       }
 
       RecipeBuilder.setRecipe(Instr, Recipe);
-      if (isa<VPWidenIntOrFpInductionRecipe>(Recipe)) {
-        // VPWidenIntOrFpInductionRecipes must be kept in the phi section of
-        // HeaderVPBB. VPWidenIntOrFpInductionRecipes for optimized truncates
-        // may be generated after non-phi recipes and need to be moved to the
-        // phi section of HeaderVPBB.
+      if (isa<VPHeaderPHIRecipe>(Recipe)) {
+        // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
+        // the following cases, VPHeaderPHIRecipes may be created after non-phi
+        // recipes and need to be moved to the phi section of HeaderVPBB:
+        // * tail-folding (non-phi recipes computing the header mask are
+        // introduced earlier than regular header phi recipes, and should appear
+        // after them)
+        // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
+
         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
-                isa<TruncInst>(Instr)) &&
+                CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
                "unexpected recipe needs moving");
         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
       } else
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -133,9 +133,12 @@
     Ingredient2Recipe[I] = R;
   }
 
+  /// Create the mask for the vector loop header block.
+  void createHeaderMask(VPlan &Plan);
+
   /// A helper function that computes the predicate of the block BB, assuming
-  /// that the header block of the loop is set to True. It returns the *entry*
-  /// mask for the block BB.
+  /// that the header block of the loop is set to True or the loop mask when
+  /// tail folding. It returns the *entry* mask for the block BB.
   VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan);
 
   /// A helper function that computes the predicate of the edge between SRC