diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8157,40 +8157,9 @@ // load/store/gather/scatter. Initialize BlockMask to no-mask. VPValue *BlockMask = nullptr; - if (OrigLoop->getHeader() == BB) { if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - - assert(CM.foldTailByMasking() && "must fold the tail"); - - // If we're using the active lane mask for control flow, then we get the - // mask from the active lane mask PHI that is cached in the VPlan. - TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); - if (useActiveLaneMaskForControlFlow(TFStyle)) - return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi(); - - // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by - // constructing the desired canonical IV in the header block as its first - // non-phi instructions. - - VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); - HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); - - VPBuilder::InsertPointGuard Guard(Builder); - Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (useActiveLaneMask(TFStyle)) { - VPValue *TC = Plan.getTripCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, - nullptr, "active.lane.mask"); - } else { - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); - } - return BlockMaskCache[BB] = BlockMask; - } + assert (OrigLoop->getHeader() != BB); // This is the block mask. We OR all incoming edges. for (auto *Predecessor : predecessors(BB)) { @@ -8209,6 +8178,48 @@ return BlockMaskCache[BB] = BlockMask; } +VPValue *VPRecipeBuilder::createTailFoldHeaderMask(VPlan &Plan) { + BasicBlock *Header = OrigLoop->getHeader(); + // Look for cached value. + BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(Header); + if (BCEntryIt != BlockMaskCache.end()) + return BCEntryIt->second; + + // All-one mask is modelled as no-mask following the convention for masked + // load/store/gather/scatter. Initialize BlockMask to no-mask. + VPValue *BlockMask = nullptr; + + assert(CM.foldTailByMasking() && "must fold the tail"); + + // If we're using the active lane mask for control flow, then we get the + // mask from the active lane mask PHI that is cached in the VPlan. + TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); + if (useActiveLaneMaskForControlFlow(TFStyle)) + return BlockMaskCache[Header] = Plan.getActiveLaneMaskPhi(); + + // Introduce the early-exit compare IV <= BTC to form header block mask. + // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by + // constructing the desired canonical IV in the header block as its first + // non-phi instructions. + + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); + auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); + HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); + + VPBuilder::InsertPointGuard Guard(Builder); + Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); + if (useActiveLaneMask(TFStyle)) { + VPValue *TC = Plan.getTripCount(); + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, + nullptr, "active.lane.mask"); + } else { + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); + } + return BlockMaskCache[Header] = BlockMask; +} + VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, VFRange &Range, @@ -8903,6 +8914,9 @@ DLInst ? DLInst->getDebugLoc() : DebugLoc(), CM.getTailFoldingStyle(IVUpdateMayOverflow)); + if (CM.foldTailByMasking()) + RecipeBuilder.createTailFoldHeaderMask(*Plan); + // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); @@ -8965,11 +8979,11 @@ } RecipeBuilder.setRecipe(Instr, Recipe); - if (isa(Recipe) && + if (isa(Recipe) && HeaderVPBB->getFirstNonPhi() != VPBB->end()) { // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the // phi section of HeaderVPBB. - assert(isa(Instr)); + assert(CM.foldTailByMasking() || isa(Instr)); Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); } else VPBB->appendRecipe(Recipe); diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -138,6 +138,9 @@ /// mask for the block BB. VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan); + /// Create the mask for the vector loop header block when tail-folding. + VPValue *createTailFoldHeaderMask(VPlan &Plan); + /// A helper function that computes the predicate of the edge between SRC /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlan &Plan);