Please use GitHub pull requests for new patches. Phabricator shutdown timeline
Changeset View
Standalone View
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
- This file is larger than 256 KB, so syntax highlighting is disabled by default.
Show First 20 Lines • Show All 1,562 Lines • ▼ Show 20 Lines | public: | ||||
} | } | ||||
/// Returns true if a scalar epilogue is not allowed due to optsize or a | /// Returns true if a scalar epilogue is not allowed due to optsize or a | ||||
/// loop hint annotation. | /// loop hint annotation. | ||||
bool isScalarEpilogueAllowed() const { | bool isScalarEpilogueAllowed() const { | ||||
return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; | return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; | ||||
} | } | ||||
/// Returns the TailFoldingStyle that is best for the current loop. | /// Returns the TailFoldingStyle that is best for the current loop. | ||||
david-arm: Can you add some `///` comments here please? | |||||
TailFoldingStyle getTailFoldingStyle() const { | TailFoldingStyle | ||||
getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { | |||||
if (!CanFoldTailByMasking) | if (!CanFoldTailByMasking) | ||||
return TailFoldingStyle::None; | return TailFoldingStyle::None; | ||||
if (ForceTailFoldingStyle.getNumOccurrences()) | if (ForceTailFoldingStyle.getNumOccurrences()) | ||||
return ForceTailFoldingStyle; | return ForceTailFoldingStyle; | ||||
return TTI.getPreferredTailFoldingStyle(); | return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); | ||||
} | } | ||||
It seems weird to ask the target for their preferred style and then immediate override it. Is there a reason not to pass IVUpdateCannotOverflow to getPreferredTailFoldingStyle? paulwalker-arm: It seems weird to ask the target for their preferred style and then immediate override it. Is… | |||||
/// Returns true if all loop blocks should be masked to fold tail loop. | /// Returns true if all loop blocks should be masked to fold tail loop. | ||||
bool foldTailByMasking() const { | bool foldTailByMasking() const { | ||||
return getTailFoldingStyle() != TailFoldingStyle::None; | return getTailFoldingStyle() != TailFoldingStyle::None; | ||||
} | } | ||||
/// Returns true if the instructions in this block requires predication | /// Returns true if the instructions in this block requires predication | ||||
/// for any reason, e.g. because tail folding now requires a predicate | /// for any reason, e.g. because tail folding now requires a predicate | ||||
▲ Show 20 Lines • Show All 1,002 Lines • ▼ Show 20 Lines | if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) | ||||
return MaxVScale; | return MaxVScale; | ||||
if (F.hasFnAttribute(Attribute::VScaleRange)) | if (F.hasFnAttribute(Attribute::VScaleRange)) | ||||
return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); | return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); | ||||
return std::nullopt; | return std::nullopt; | ||||
} | } | ||||
/// For the given VF and UF and maximum trip count computed for the loop, return | |||||
/// whether the induction variable might overflow in the vectorized loop. If not, | |||||
/// then we know a runtime overflow check always evaluates to false and can be | |||||
/// removed. | |||||
static bool isIndvarOverflowCheckKnownFalse( | |||||
const LoopVectorizationCostModel *Cost, | |||||
ElementCount VF, std::optional<unsigned> UF = std::nullopt) { | |||||
// Always be conservative if we don't know the exact unroll factor. | |||||
unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); | |||||
Type *IdxTy = Cost->Legal->getWidestInductionType(); | |||||
APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); | |||||
// We know the runtime overflow check is known false iff the (max) trip-count | |||||
// is known and (max) trip-count + (VF * UF) does not overflow in the type of | |||||
// the vector loop induction variable. | |||||
if (unsigned TC = | |||||
Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { | |||||
uint64_t MaxVF = VF.getKnownMinValue(); | |||||
if (VF.isScalable()) { | |||||
std::optional<unsigned> MaxVScale = | |||||
getMaxVScale(*Cost->TheFunction, Cost->TTI); | |||||
if (!MaxVScale) | |||||
return false; | |||||
MaxVF *= *MaxVScale; | |||||
} | |||||
return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); | |||||
} | |||||
return false; | |||||
} | |||||
void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, | void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, | ||||
const VPIteration &Instance, | const VPIteration &Instance, | ||||
VPTransformState &State) { | VPTransformState &State) { | ||||
Value *ScalarInst = State.get(Def, Instance); | Value *ScalarInst = State.get(Def, Instance); | ||||
Value *VectorValue = State.get(Def, Instance.Part); | Value *VectorValue = State.get(Def, Instance.Part); | ||||
VectorValue = Builder.CreateInsertElement( | VectorValue = Builder.CreateInsertElement( | ||||
VectorValue, ScalarInst, | VectorValue, ScalarInst, | ||||
Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); | Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); | ||||
▲ Show 20 Lines • Show All 435 Lines • ▼ Show 20 Lines | auto CreateStep = [&]() -> Value * { | ||||
Value *MinProfTC = | Value *MinProfTC = | ||||
createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); | createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); | ||||
if (!VF.isScalable()) | if (!VF.isScalable()) | ||||
return MinProfTC; | return MinProfTC; | ||||
return Builder.CreateBinaryIntrinsic( | return Builder.CreateBinaryIntrinsic( | ||||
Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); | Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); | ||||
}; | }; | ||||
TailFoldingStyle Style = Cost->getTailFoldingStyle(); | TailFoldingStyle Style = Cost->getTailFoldingStyle(); | ||||
Up to you but personally I think for all instances the reverse polarity IVUpdateCanOverflow reads better. paulwalker-arm: Up to you but personally I think for all instances the reverse polarity `IVUpdateCanOverflow`… | |||||
if (Style == TailFoldingStyle::None) | if (Style == TailFoldingStyle::None) | ||||
This is just a thought, but I think you can probably simplify this to just: TailFoldingStyle Style = Cost->getTailFoldingStyle(); if (Style == TailFoldingStyle::None) CheckMinIters = Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); else if (VF.isScalable() && Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck && !Cost->isIndvarOverflowCheckKnownFalse(VF, UF)) { ... That way you only need to call isIndvarOverflowCheckKnownFalse just before you're about to actually create the checks. What do you think? david-arm: This is just a thought, but I think you can probably simplify this to just:
TailFoldingStyle… | |||||
CheckMinIters = | CheckMinIters = | ||||
Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); | Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); | ||||
else if (VF.isScalable() && | else if (VF.isScalable() && | ||||
!isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && | |||||
Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { | Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { | ||||
// vscale is not necessarily a power-of-2, which means we cannot guarantee | // vscale is not necessarily a power-of-2, which means we cannot guarantee | ||||
// an overflow to zero when updating induction variables and so an | // an overflow to zero when updating induction variables and so an | ||||
// additional overflow check is required before entering the vector loop. | // additional overflow check is required before entering the vector loop. | ||||
// Get the maximum unsigned value for the type. | // Get the maximum unsigned value for the type. | ||||
Value *MaxUIntTripCount = | Value *MaxUIntTripCount = | ||||
ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); | ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); | ||||
▲ Show 20 Lines • Show All 422 Lines • ▼ Show 20 Lines | if (Instruction *V = CSEMap.lookup(&In)) { | ||||
In.eraseFromParent(); | In.eraseFromParent(); | ||||
continue; | continue; | ||||
} | } | ||||
CSEMap[&In] = &In; | CSEMap[&In] = &In; | ||||
} | } | ||||
} | } | ||||
InstructionCost LoopVectorizationCostModel::getVectorCallCost( | InstructionCost LoopVectorizationCostModel::getVectorCallCost( | ||||
Forgive my ignorance here but this doesn't look like a cost model question? paulwalker-arm: Forgive my ignorance here but this doesn't look like a cost model question? | |||||
CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const { | CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const { | ||||
Function *F = CI->getCalledFunction(); | Function *F = CI->getCalledFunction(); | ||||
Type *ScalarRetTy = CI->getType(); | Type *ScalarRetTy = CI->getType(); | ||||
SmallVector<Type *, 4> Tys, ScalarTys; | SmallVector<Type *, 4> Tys, ScalarTys; | ||||
for (auto &ArgOp : CI->args()) | for (auto &ArgOp : CI->args()) | ||||
ScalarTys.push_back(ArgOp->getType()); | ScalarTys.push_back(ArgOp->getType()); | ||||
// Estimate cost of scalarized vector call. The source operands are assumed | // Estimate cost of scalarized vector call. The source operands are assumed | ||||
// to be vectors, so we need to extract individual elements from there, | // to be vectors, so we need to extract individual elements from there, | ||||
// execute VF scalar calls, and then gather the result into the vector return | // execute VF scalar calls, and then gather the result into the vector return | ||||
// value. | // value. | ||||
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | ||||
num-elts-per-iteration or (VF * UF)? paulwalker-arm: `num-elts-per-iteration` or `(VF * UF)`? | |||||
InstructionCost ScalarCallCost = | InstructionCost ScalarCallCost = | ||||
TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind); | TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind); | ||||
I wonder if this is safe because getSmallBestKnownTC also returns a value from profiling for the expected trip count, which is more like a hint rather than a definite? david-arm: I wonder if this is safe because `getSmallBestKnownTC` also returns a value from profiling for… | |||||
This doesn't look entirely safe. I think you really want SE.getSmallConstantTripCount? paulwalker-arm: This doesn't look entirely safe. I think you really want `SE.getSmallConstantTripCount`? | |||||
if (VF.isScalar()) | if (VF.isScalar()) | ||||
MaxVF? because Elts is implied. paulwalker-arm: `MaxVF`? because Elts is implied. | |||||
return ScalarCallCost; | return ScalarCallCost; | ||||
We so need to get rid of TTI::getMaxVScale(). I think function attributes should take precedence but then this is the order used within getMaxLegalScalableVF so perhaps best fixed separately. Up to you. paulwalker-arm: We so need to get rid of `TTI::getMaxVScale()`. I think function attributes should take… | |||||
I'd rather fix that separately so that the order is consistent between getMaxLegalScalableVF and here. sdesmalen: I'd rather fix that separately so that the order is consistent between getMaxLegalScalableVF… | |||||
// Compute corresponding vector type for return value and arguments. | // Compute corresponding vector type for return value and arguments. | ||||
Type *RetTy = ToVectorTy(ScalarRetTy, VF); | Type *RetTy = ToVectorTy(ScalarRetTy, VF); | ||||
for (Type *ScalarTy : ScalarTys) | for (Type *ScalarTy : ScalarTys) | ||||
I suppose this can overflow. Perhaps make MaxVFElts an uint64_t given that's what ugt will promote to anyway. paulwalker-arm: I suppose this can overflow. Perhaps make `MaxVFElts` an `uint64_t` given that's what `ugt`… | |||||
Tys.push_back(ToVectorTy(ScalarTy, VF)); | Tys.push_back(ToVectorTy(ScalarTy, VF)); | ||||
// Compute costs of unpacking argument values for the scalar calls and | // Compute costs of unpacking argument values for the scalar calls and | ||||
// packing the return values to a vector. | // packing the return values to a vector. | ||||
InstructionCost ScalarizationCost = | InstructionCost ScalarizationCost = | ||||
getScalarizationOverhead(CI, VF, CostKind); | getScalarizationOverhead(CI, VF, CostKind); | ||||
InstructionCost Cost = | InstructionCost Cost = | ||||
▲ Show 20 Lines • Show All 5,457 Lines • ▼ Show 20 Lines | VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( | ||||
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); | VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); | ||||
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); | VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); | ||||
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); | VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); | ||||
auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); | auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); | ||||
VPBlockUtils::insertBlockAfter(TopRegion, Preheader); | VPBlockUtils::insertBlockAfter(TopRegion, Preheader); | ||||
VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); | VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); | ||||
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); | VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); | ||||
// Don't use getDecisionAndClampRange here, because we don't know the UF | |||||
// so this function is better to be conservative, rather than to split | |||||
// it up into different VPlans. | |||||
bool IVUpdateMayOverflow = false; | |||||
for (ElementCount VF = Range.Start; | |||||
ElementCount::isKnownLT(VF, Range.End); VF *= 2) | |||||
IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); | |||||
Not Done ReplyInline ActionsIs addCanonicalIVRecipes being passed the "wrong" tail folding style a functional issue? or just a corner case that might affect performance. paulwalker-arm: Is `addCanonicalIVRecipes` being passed the "wrong" tail folding style a functional issue? or… | |||||
This is not a functional issue. In the worst case it generates both the runtime check and computes the slightly more expensive runtime trip-count in the preheader. sdesmalen: This is not a functional issue. In the worst case it generates both the runtime check and… | |||||
Instruction *DLInst = | Instruction *DLInst = | ||||
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); | getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); | ||||
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), | addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), | ||||
DLInst ? DLInst->getDebugLoc() : DebugLoc(), | DLInst ? DLInst->getDebugLoc() : DebugLoc(), | ||||
CM.getTailFoldingStyle()); | CM.getTailFoldingStyle(IVUpdateMayOverflow)); | ||||
// Scan the body of the loop in a topological order to visit each basic block | // Scan the body of the loop in a topological order to visit each basic block | ||||
// after having visited its predecessor basic blocks. | // after having visited its predecessor basic blocks. | ||||
LoopBlocksDFS DFS(OrigLoop); | LoopBlocksDFS DFS(OrigLoop); | ||||
DFS.perform(LI); | DFS.perform(LI); | ||||
VPBasicBlock *VPBB = HeaderVPBB; | VPBasicBlock *VPBB = HeaderVPBB; | ||||
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { | for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { | ||||
▲ Show 20 Lines • Show All 1,718 Lines • Show Last 20 Lines |
Can you add some /// comments here please?