diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1532,25 +1532,20 @@ })); } - /// Returns true if \p I is an instruction that will be scalarized with - /// predication. Such instructions include conditional stores and - /// instructions that may divide by zero. - /// If a non-zero VF has been calculated, we check if I will be scalarized - /// predication for that VF. - bool isScalarWithPredication(Instruction *I) const; - - // Returns true if \p I is an instruction that will be predicated either - // through scalar predication or masked load/store or masked gather/scatter. - // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I) { - if (!Legal->blockNeedsPredication(I->getParent()) && !foldTailByMasking()) - return false; - // Loads and stores that need some form of masked operation are predicated - // instructions. - if (isa<LoadInst>(I) || isa<StoreInst>(I)) - return Legal->isMaskRequired(I); - return isScalarWithPredication(I); - } + /// Returns true if \p I is an instruction that requires predication, but + /// cannot use predicated vector ops to implement the widened operations, thus + /// having to fall back on scalarized operations. + /// If \p VFIterationIsPredicated is true, then the vector iteration will use + /// predication to enable only the active lanes handled in the VF iteration, + /// even if the block is not predicated in the original scalar loop (e.g. for + /// tail folding). + bool isScalarWithPredication(Instruction *I, + bool VFIterationIsPredicated) const; + + /// Return true if the instruction requires a predicated vector operation + /// when widening \p I to a vector. Such instructions include conditional + /// stores and instructions that may divide by zero. 
+ bool requiresPredication(Instruction *I, bool VFIterationIsPredicated) const; /// Returns true if \p I is a memory instruction with consecutive memory access that can be widened. @@ -5251,16 +5246,40 @@ Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { - if (!Legal->blockNeedsPredication(I->getParent()) && !foldTailByMasking()) +bool LoopVectorizationCostModel::requiresPredication( + Instruction *I, bool VFIterationIsPredicated) const { + // If no predication is used, no predication is needed for the widened op. + if (!Legal->blockNeedsPredication(I->getParent()) && !VFIterationIsPredicated) return false; - switch(I->getOpcode()) { + + switch (I->getOpcode()) { default: break; case Instruction::Load: - case Instruction::Store: { - if (!Legal->isMaskRequired(I)) - return false; + case Instruction::Store: + // Some instructions can be speculated, even when predication is used + // for the block. + return Legal->isMaskRequired(I); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + // If predication is used for this block and the operation would otherwise + // be guarded, then this requires a predicated vector operation. + return mayDivideByZero(*I); + } + + // By default, all operations can be widened safely without predication. + return false; +} + +bool LoopVectorizationCostModel::isScalarWithPredication( + Instruction *I, bool VFIterationIsPredicated) const { + if (!requiresPredication(I, VFIterationIsPredicated)) + return false; + + // See if masked.load/store instructions are legal. 
+ if (isa<LoadInst>(I) || isa<StoreInst>(I)) { auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getLoadStoreType(I); const Align Alignment = getLoadStoreAlignment(I); @@ -5269,13 +5288,10 @@ : !(isLegalMaskedStore(Ty, Ptr, Alignment) || TTI.isLegalMaskedScatter(Ty, Alignment)); } - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - return mayDivideByZero(*I); - } - return false; + + // Until the LV adds support for using llvm.vp intrinsics to handle + // these operations, fall back on predicated scalar operations. + return true; } bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( @@ -5340,7 +5356,7 @@ // If the instruction is a store located in a predicated block, it will be // scalarized. - if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, foldTailByMasking())) return false; // If the instruction's allocated size doesn't equal it's type size, it @@ -5391,7 +5407,7 @@ << *I << "\n"); return; } - if (isScalarWithPredication(I)) { + if (isScalarWithPredication(I, foldTailByMasking())) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); return; @@ -6799,7 +6815,7 @@ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. - assert(isPredicatedInst(I) && + assert(requiresPredication(I, foldTailByMasking()) && "Expecting a scalar emulated instruction"); return isa<LoadInst>(I) || (isa<StoreInst>(I) && @@ -6827,7 +6843,7 @@ if (!Legal->blockNeedsPredication(BB) && !foldTailByMasking()) continue; for (Instruction &I : *BB) - if (isScalarWithPredication(&I)) { + if (isScalarWithPredication(&I, foldTailByMasking())) { ScalarCostsTy ScalarCosts; // Do not apply discount if scalable, because that would lead to // invalid scalarization costs. @@ -6869,7 +6885,7 @@ // If the instruction is scalar with predication, it will be analyzed // separately. 
We ignore it within the context of PredInst. - if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, foldTailByMasking())) return false; // If any of the instruction's operands are uniform after vectorization, @@ -6916,7 +6932,8 @@ // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. - if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { + if (isScalarWithPredication(I, foldTailByMasking()) && + !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), true, false); @@ -7077,7 +7094,7 @@ // If we have a predicated load/store, it will need extra i1 extracts and // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. - if (isPredicatedInst(I)) { + if (requiresPredication(I, foldTailByMasking())) { Cost /= getReciprocalPredBlockProb(); // Add the cost of an i1 extract and a branch @@ -7465,7 +7482,8 @@ // predicated uniform stores. Today they are treated as any other // predicated store (see added test cases in // invariant-store-vectorization.ll). - if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) + if (isa<StoreInst>(&I) && + isScalarWithPredication(&I, foldTailByMasking())) NumPredStores++; if (Legal->isUniformMemOp(I)) { @@ -7720,7 +7738,7 @@ // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. 
- if (VF.isVector() && isScalarWithPredication(I)) { + if (VF.isVector() && isScalarWithPredication(I, foldTailByMasking())) { InstructionCost Cost = 0; // These instructions have a non-void type, so account for the phi nodes @@ -8906,7 +8924,9 @@ VFRange &Range) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, CM.foldTailByMasking()); + }, Range); if (IsPredicated) @@ -8946,7 +8966,8 @@ // scalarization is profitable or it is predicated. auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); + CM.isProfitableToScalarize(I, VF) || + CM.isScalarWithPredication(I, CM.foldTailByMasking()); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); @@ -9020,7 +9041,10 @@ Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); + [&](ElementCount VF) { + return CM.requiresPredication(I, CM.foldTailByMasking()); + }, + Range); // Even if the instruction is not marked as uniform, there are certain // intrinsic calls that can be effectively treated as such, so we check for @@ -9067,7 +9091,7 @@ continue; auto *RepR = cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); - assert(RepR->isPredicated() && + assert(RepR->isPredicated() && "expected Replicate recipe to be predicated"); RepR->setAlsoPack(false); }