diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1200,7 +1200,11 @@
   CM_ScalarEpilogueNotNeededUsePredicate,
 
   // Directive indicating we must either tail fold or not vectorize
-  CM_ScalarEpilogueNotAllowedUsePredicate
+  CM_ScalarEpilogueNotAllowedUsePredicate,
+
+  // The epilogue can be dropped entirely because the VF handles all
+  // iterations.
+  CM_RequiresNoEpilogue
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1570,10 +1574,27 @@
   }
 
   /// Returns true if all loop blocks should be masked to fold tail loop.
-  bool foldTailByMasking() const { return FoldTailByMasking; }
+  bool foldTailByMasking() const {
+    switch (ScalarEpilogueStatus) {
+    // A scalar epilogue covers the tail, or there is no tail to cover.
+    case CM_RequiresNoEpilogue:
+    case CM_ScalarEpilogueAllowed:
+      return false;
+    // Every remaining status folds the tail into the vector loop by masking.
+    case CM_ScalarEpilogueNotAllowedOptSize:
+    case CM_ScalarEpilogueNotAllowedLowTripLoop:
+    case CM_ScalarEpilogueNotNeededUsePredicate:
+    case CM_ScalarEpilogueNotAllowedUsePredicate:
+      return true;
+    }
+
+    llvm_unreachable("Unexpected status");
+  }
 
+  /// Returns true if \p BB must be predicated: either the tail is folded by
+  /// masking, or Legal reports that the block itself needs predication.
   bool blockNeedsPredication(BasicBlock *BB) const {
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
@@ -1731,9 +1752,6 @@
   /// iterations to execute in the scalar loop.
   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
 
-  /// All blocks of loop are to be masked to fold tail of scalar iterations.
-  bool FoldTailByMasking = false;
-
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
@@ -5734,6 +5752,8 @@
   }
 
   switch (ScalarEpilogueStatus) {
+  case CM_RequiresNoEpilogue:
+    llvm_unreachable("No VF has been chosen to determine this");
   case CM_ScalarEpilogueAllowed:
     return computeFeasibleMaxVF(TC, UserVF);
   case CM_ScalarEpilogueNotAllowedUsePredicate:
@@ -5809,6 +5829,7 @@
   if (Rem->isZero()) {
     // Accept MaxVF if we do not have a tail.
     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+    ScalarEpilogueStatus = CM_RequiresNoEpilogue;
     return MaxVF;
   }
 
@@ -5816,10 +5837,8 @@
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  if (Legal->prepareToFoldTailByMasking()) {
-    FoldTailByMasking = true;
+  if (Legal->prepareToFoldTailByMasking())
     return MaxVF;
-  }
 
   // If there was a tail-folding hint/switch, but we can't fold the tail by
   // masking, fallback to a vectorization with a scalar epilogue.
@@ -5945,7 +5964,7 @@
 
   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
 
-  if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
+  if (!A.Width.isScalable() && !B.Width.isScalable() && foldTailByMasking() &&
       MaxTripCount) {
     // If we are folding the tail and the trip count is a known (possibly small)
     // constant, the trip count will be rounded up to an integer number of