diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -330,6 +330,12 @@ /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB) const; + /// Return true if the instruction requires a predicated vector operation + /// when widening \p I to a vector. Such instructions include conditional + /// stores and instructions that may divide by zero. + bool requiresPredicatedWidening(Instruction *I, + bool VFIterationIsPredicated) const; + /// Check if this pointer is consecutive when vectorizing. This happens /// when the last index of the GEP is the induction variable, or that the /// pointer itself is an induction variable. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1322,4 +1322,49 @@ return true; } +/// A helper function for checking whether an integer division-related +/// instruction may divide by zero (in which case it must be predicated if +/// executed conditionally in the scalar code). +/// TODO: It may be worthwhile to generalize and check isKnownNonZero(). +/// Non-zero divisors that are non compile-time constants will not be +/// converted into multiplication, so we will still end up scalarizing +/// the division, but can do so w/o predication. 
+static bool mayDivideByZero(Instruction &I) { + assert((I.getOpcode() == Instruction::UDiv || + I.getOpcode() == Instruction::SDiv || + I.getOpcode() == Instruction::URem || + I.getOpcode() == Instruction::SRem) && + "Unexpected instruction"); + Value *Divisor = I.getOperand(1); + auto *CInt = dyn_cast<ConstantInt>(Divisor); + return !CInt || CInt->isZero(); +} + +bool LoopVectorizationLegality::requiresPredicatedWidening( + Instruction *I, bool VFIterationIsPredicated) const { + // If no predication is used, no predication is needed for the widened op. + if (!blockNeedsPredication(I->getParent()) && !VFIterationIsPredicated) + return false; + + switch (I->getOpcode()) { + default: + break; + case Instruction::Load: + case Instruction::Store: + // Some instructions can be speculated, even when predication is used + // for the block. + return isMaskRequired(I); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + // If predication is used for this block and the operation would otherwise + // be guarded, then this requires a predicated vector operation. + return mayDivideByZero(*I); + } + + // By default, all operations can be widened safely without predication. + return false; +} + } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1511,25 +1511,15 @@ })); } - /// Returns true if \p I is an instruction that will be scalarized with - /// predication. Such instructions include conditional stores and - /// instructions that may divide by zero. - /// If a non-zero VF has been calculated, we check if I will be scalarized - /// predication for that VF. 
- bool isScalarWithPredication(Instruction *I) const; - - // Returns true if \p I is an instruction that will be predicated either - // through scalar predication or masked load/store or masked gather/scatter. - // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I) { - if (!blockNeedsPredication(I->getParent())) - return false; - // Loads and stores that need some form of masked operation are predicated - // instructions. - if (isa<LoadInst>(I) || isa<StoreInst>(I)) - return Legal->isMaskRequired(I); - return isScalarWithPredication(I); - } + /// Returns true if \p I is an instruction that requires predicated widening, + /// and cannot use predicated vector ops to implement the widened operations, + /// thus falling back on scalarized operations. + /// If \p VFIterationIsPredicated is true, then the vector iteration will use + /// predication to enable only the active lanes handled in the VF iteration, + /// even if the block is not predicated in the original scalar loop (e.g. for + /// tail folding). + bool isScalarWithPredication(Instruction *I, + bool VFIterationIsPredicated) const; /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. @@ -1576,10 +1566,6 @@ /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } - bool blockNeedsPredication(BasicBlock *BB) const { - return foldTailByMasking() || Legal->blockNeedsPredication(BB); - } - /// A SmallMapVector to store the InLoop reduction op chains, mapping phi /// nodes to the chain of instructions representing the reductions. Uses a /// MapVector to ensure deterministic iteration order. @@ -4926,24 +4912,6 @@ } } -/// A helper function for checking whether an integer division-related -/// instruction may divide by zero (in which case it must be predicated if -/// executed conditionally in the scalar code). 
-/// TODO: It may be worthwhile to generalize and check isKnownNonZero(). -/// Non-zero divisors that are non compile-time constants will not be -/// converted into multiplication, so we will still end up scalarizing -/// the division, but can do so w/o predication. -static bool mayDivideByZero(Instruction &I) { - assert((I.getOpcode() == Instruction::UDiv || - I.getOpcode() == Instruction::SDiv || - I.getOpcode() == Instruction::URem || - I.getOpcode() == Instruction::SRem) && - "Unexpected instruction"); - Value *Divisor = I.getOperand(1); - auto *CInt = dyn_cast<ConstantInt>(Divisor); - return !CInt || CInt->isZero(); -} - void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, VPUser &User, VPTransformState &State) { @@ -5334,31 +5302,23 @@ Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { - if (!blockNeedsPredication(I->getParent())) +bool LoopVectorizationCostModel::isScalarWithPredication( + Instruction *I, bool VFIterationIsPredicated) const { + if (!Legal->requiresPredicatedWidening(I, VFIterationIsPredicated)) return false; - switch(I->getOpcode()) { - default: - break; - case Instruction::Load: - case Instruction::Store: { - if (!Legal->isMaskRequired(I)) - return false; - auto *Ptr = getLoadStorePointerOperand(I); - auto *Ty = getLoadStoreType(I); - const Align Alignment = getLoadStoreAlignment(I); - return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || - TTI.isLegalMaskedGather(Ty, Alignment)) - : !(isLegalMaskedStore(Ty, Ptr, Alignment) || - TTI.isLegalMaskedScatter(Ty, Alignment)); - } - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - return mayDivideByZero(*I); - } - return false; + + // Until the LV adds support for using llvm.vp intrinsics to handle + // these operations, fall back on predicated scalar operations. 
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) + return true; + + auto *Ptr = getLoadStorePointerOperand(I); + auto *Ty = getLoadStoreType(I); + const Align Alignment = getLoadStoreAlignment(I); + return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || + TTI.isLegalMaskedGather(Ty, Alignment)) + : !(isLegalMaskedStore(Ty, Ptr, Alignment) || + TTI.isLegalMaskedScatter(Ty, Alignment)); } bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( @@ -5413,7 +5373,7 @@ // If the instruction is a store located in a predicated block, it will be // scalarized. - if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, foldTailByMasking())) return false; // If the instruction's allocated size doesn't equal it's type size, it @@ -5463,7 +5423,7 @@ << *I << "\n"); return; } - if (isScalarWithPredication(I)) { + if (isScalarWithPredication(I, foldTailByMasking())) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); return; @@ -6703,7 +6663,7 @@ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. - assert(isPredicatedInst(I) && + assert(Legal->requiresPredicatedWidening(I, foldTailByMasking()) && "Expecting a scalar emulated instruction"); return isa<LoadInst>(I) || (isa<StoreInst>(I) && @@ -6728,10 +6688,10 @@ // determine if it would be better to not if-convert the blocks they are in. // If so, we also record the instructions to scalarize. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockNeedsPredication(BB)) + if (!Legal->blockNeedsPredication(BB) && !foldTailByMasking()) continue; for (Instruction &I : *BB) - if (isScalarWithPredication(&I)) { + if (isScalarWithPredication(&I, foldTailByMasking())) { ScalarCostsTy ScalarCosts; // Do not apply discount logic if hacked cost is needed // for emulated masked memrefs. 
@@ -6771,7 +6731,7 @@ // If the instruction is scalar with predication, it will be analyzed // separately. We ignore it within the context of PredInst. - if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, foldTailByMasking())) return false; // If any of the instruction's operands are uniform after vectorization, @@ -6819,7 +6779,8 @@ // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. - if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { + if (isScalarWithPredication(I, foldTailByMasking()) && + !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); @@ -6976,7 +6937,7 @@ // If we have a predicated load/store, it will need extra i1 extracts and // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. - if (isPredicatedInst(I)) { + if (Legal->requiresPredicatedWidening(I, foldTailByMasking())) { Cost /= getReciprocalPredBlockProb(); // Add the cost of an i1 extract and a branch @@ -7323,7 +7284,8 @@ // predicated uniform stores. Today they are treated as any other // predicated store (see added test cases in // invariant-store-vectorization.ll). - if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) + if (isa<StoreInst>(&I) && + isScalarWithPredication(&I, foldTailByMasking())) NumPredStores++; if (Legal->isUniformMemOp(I)) { @@ -7570,7 +7532,7 @@ // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. 
- if (VF.isVector() && isScalarWithPredication(I)) { + if (VF.isVector() && isScalarWithPredication(I, foldTailByMasking())) { InstructionCost Cost = 0; // These instructions have a non-void type, so account for the phi nodes @@ -7965,7 +7927,8 @@ return None; // Invalidate interleave groups if all blocks of loop will be predicated. - if (CM.blockNeedsPredication(OrigLoop->getHeader()) && + if ((Legal->blockNeedsPredication(OrigLoop->getHeader()) || + CM.foldTailByMasking()) && !useMaskedInterleavedAccesses(*TTI)) { LLVM_DEBUG( dbgs() @@ -8563,7 +8526,7 @@ VPValue *BlockMask = nullptr; if (OrigLoop->getHeader() == BB) { - if (!CM.blockNeedsPredication(BB)) + if (!Legal->blockNeedsPredication(BB) && !CM.foldTailByMasking()) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. // Create the block in mask as the first non-phi instruction in the block. @@ -8735,7 +8698,9 @@ VFRange &Range) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, CM.foldTailByMasking()); + }, Range); if (IsPredicated) @@ -8777,7 +8742,8 @@ // scalarization is profitable or it is predicated. 
auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); + CM.isProfitableToScalarize(I, VF) || + CM.isScalarWithPredication(I, CM.foldTailByMasking()); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); @@ -8851,7 +8817,10 @@ Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); + [&](ElementCount VF) { + return Legal->requiresPredicatedWidening(I, CM.foldTailByMasking()); + }, + Range); auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), IsUniform, IsPredicated);