Changeset View
Standalone View
llvm/lib/Target/ARM/MVETailPredication.cpp
Show First 20 Lines • Show All 363 Lines • ▼ Show 20 Lines | bool ForceTailPredication = | ||||
EnableTailPredication == TailPredication::ForceEnabled; | EnableTailPredication == TailPredication::ForceEnabled; | ||||
// 1) Test whether entry to the loop is protected by a conditional | // 1) Test whether entry to the loop is protected by a conditional | ||||
// BTC + 1 < 0. In other words, if the scalar trip count overflows, | // BTC + 1 < 0. In other words, if the scalar trip count overflows, | ||||
// becomes negative, we shouldn't enter the loop and creating | // becomes negative, we shouldn't enter the loop and creating | ||||
// tripcount expression BTC + 1 is not safe. So, check that BTC | // tripcount expression BTC + 1 is not safe. So, check that BTC | ||||
// isn't max. This is evaluated in unsigned, because the semantics | // isn't max. This is evaluated in unsigned, because the semantics | ||||
// of @get.active.lane.mask is a ULE comparison. | // of @get.active.lane.mask is a ULE comparison. | ||||
int VectorWidth = VecTy->getNumElements(); | |||||
auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1); | auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1); | ||||
auto *BTC = SE->getSCEV(BackedgeTakenCount); | auto *BTC = SE->getSCEV(BackedgeTakenCount); | ||||
auto *MaxBTC = SE->getConstantMaxBackedgeTakenCount(L); | |||||
efriedma: I'm forgetting, do we have a check somewhere that BackedgeTakenCount is actually the backedge… | |||||
SjoerdMeijerAuthorUnsubmitted Good point. I thought we had one, but we have similar checks for the IV, which is done in step 3) below on line 464. Will address this in a follow up. SjoerdMeijer: Good point. I thought we had one, but we have similar checks for the IV, which is done in step… | |||||
efriedmaUnsubmitted Not Done ReplyInline ActionsI think I messed up reviewing this. Took me a bit of time to remember what's going on here. SE->getConstantMaxBackedgeTakenCount() is the backedge-taken count of the loop. BTC is the number of elements the loop processes, minus one. If you want to ensure BTC + 1 doesn't overflow, getConstantMaxBackedgeTakenCount() doesn't actually help, unless you prove some connection between the two values. The code currently in IsSafeActiveMask does not try to prove that connection. efriedma: I think I messed up reviewing this. Took me a bit of time to remember what's going on here. | |||||
efriedmaUnsubmitted Not Done ReplyInline ActionsOn a higher-level note, I think this work has shown that trying to force this to work is really complicated, and it might make sense to change the vectorizer to generate something that's easier to analyze. efriedma: On a higher-level note, I think this work has shown that trying to force this to work is really… | |||||
SjoerdMeijerAuthorUnsubmitted Ah, sorry about this. Don't know what I was thinking...somehow had myself convinced, but it's obviously not really what we need. I fully agree about your high-level remark. This whole exercise has shown the limitations of our current approach, i.e. the complex analysis required, which we can hopefully avoid by changing the intrinsic/semantics. So, I am going to pursue that direction, and will soon propose something for this. SjoerdMeijer: Ah, sorry about this. Don't know what I was thinking...somehow had myself convinced, but it's… | |||||
if (isa<SCEVCouldNotCompute>(MaxBTC)) { | |||||
LLVM_DEBUG(dbgs() << "ARM TP: Can't compute SCEV BTC expression: "; | |||||
BTC->dump()); | |||||
return false; | |||||
} | |||||
if (!llvm::cannotBeMaxInLoop(BTC, L, *SE, false /*Signed*/) && | APInt MaxInt = APInt(BTC->getType()->getScalarSizeInBits(), ~0); | ||||
if (dyn_cast<SCEVConstant>(MaxBTC)->getAPInt().eq(MaxInt) && | |||||
efriedmaUnsubmitted Not Done ReplyInline Actionscast<>. efriedma: cast<>. | |||||
!ForceTailPredication) { | !ForceTailPredication) { | ||||
LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be max: "; | LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be max: "; | ||||
BTC->dump()); | BTC->dump()); | ||||
return false; | return false; | ||||
} | } | ||||
// 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow: | // 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow: | ||||
// | // | ||||
// (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount | // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount | ||||
// | // | ||||
// 2.1) First prove overflow can't happen in: | // 2.1) First prove overflow can't happen in: | ||||
// | // | ||||
// ElementCount + (VectorWidth - 1) | // ElementCount + (VectorWidth - 1) | ||||
// | // | ||||
// Because of a lack of context, it is difficult to get a useful bounds on | // Because of a lack of context, it is difficult to get a useful bounds on | ||||
// this expression. But since ElementCount uses the same variables as the | // this expression. But since ElementCount uses the same variables as the | ||||
// TripCount (TC), for which we can find meaningful value ranges, we use that | // TripCount (TC), for which we can find meaningful value ranges, we use that | ||||
// instead and assert that: | // instead and assert that: | ||||
// | // | ||||
// upperbound(TC) <= UINT_MAX - VectorWidth | // upperbound(TC) <= UINT_MAX - VectorWidth | ||||
// | // | ||||
auto *TC = SE->getSCEV(TripCount); | auto *TC = SE->getSCEV(TripCount); | ||||
unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits(); | unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits(); | ||||
int VectorWidth = VecTy->getNumElements(); | |||||
auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth); | auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth); | ||||
uint64_t MaxMinusVW = Diff.getZExtValue(); | uint64_t MaxMinusVW = Diff.getZExtValue(); | ||||
uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue(); | uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue(); | ||||
if (UpperboundTC > MaxMinusVW && !ForceTailPredication) { | if (UpperboundTC > MaxMinusVW && !ForceTailPredication) { | ||||
LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n"; | LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n"; | ||||
dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n"; | dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n"; | ||||
dbgs() << UpperboundTC << " <= " << MaxMinusVW << "== false\n";); | dbgs() << UpperboundTC << " <= " << MaxMinusVW << " == false\n";); | ||||
return false; | return false; | ||||
} | } | ||||
// 2.2) Make sure overflow doesn't happen in final expression: | // 2.2) Make sure overflow doesn't happen in final expression: | ||||
// (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount, | // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount, | ||||
// To do this, compare the full ranges of these subexpressions: | // To do this, compare the full ranges of these subexpressions: | ||||
// | // | ||||
// Range(Ceil) <= Range(TC) | // Range(Ceil) <= Range(TC) | ||||
▲ Show 20 Lines • Show All 204 Lines • Show Last 20 Lines |
I'm forgetting, do we have a check somewhere that BackedgeTakenCount is actually the backedge-taken count of the loop?