Changeset View
Standalone View
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
- This file is larger than 256 KB, so syntax highlighting is disabled by default.
Show First 20 Lines • Show All 977 Lines • ▼ Show 20 Lines | public: | ||||
/// The calculated cost is saved with widening decision in order to | /// The calculated cost is saved with widening decision in order to | ||||
/// avoid redundant calculations. | /// avoid redundant calculations. | ||||
void setCostBasedWideningDecision(unsigned VF); | void setCostBasedWideningDecision(unsigned VF); | ||||
/// A struct that represents some properties of the register usage | /// A struct that represents some properties of the register usage | ||||
/// of a loop. | /// of a loop. | ||||
struct RegisterUsage { | struct RegisterUsage { | ||||
/// Holds the number of loop invariant values that are used in the loop. | /// Holds the number of loop invariant values that are used in the loop. | ||||
unsigned LoopInvariantRegs; | /// The key is ClassID of target-provided register class. | ||||
SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; | |||||
/// Holds the maximum number of concurrent live intervals in the loop. | /// Holds the maximum number of concurrent live intervals in the loop. | ||||
unsigned MaxLocalUsers; | /// The key is ClassID of target-provided register class. | ||||
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; | |||||
}; | }; | ||||
/// \return Returns information about the register usages of the loop for the | /// \return Returns information about the register usages of the loop for the | ||||
/// given vectorization factors. | /// given vectorization factors. | ||||
SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs); | SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs); | ||||
/// Collect values we want to ignore in the cost model. | /// Collect values we want to ignore in the cost model. | ||||
void collectValuesToIgnore(); | void collectValuesToIgnore(); | ||||
▲ Show 20 Lines • Show All 3,925 Lines • ▼ Show 20 Lines | if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || | ||||
for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) | for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) | ||||
VFs.push_back(VS); | VFs.push_back(VS); | ||||
// For each VF calculate its register usage. | // For each VF calculate its register usage. | ||||
auto RUs = calculateRegisterUsage(VFs); | auto RUs = calculateRegisterUsage(VFs); | ||||
// Select the largest VF which doesn't require more registers than existing | // Select the largest VF which doesn't require more registers than existing | ||||
// ones. | // ones. | ||||
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); | |||||
for (int i = RUs.size() - 1; i >= 0; --i) { | for (int i = RUs.size() - 1; i >= 0; --i) { | ||||
if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { | bool Selected = true; | ||||
for (auto& pair : RUs[i].MaxLocalUsers) { | |||||
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); | |||||
if (pair.second > TargetNumRegisters) | |||||
Selected = false; | |||||
} | |||||
if (Selected) { | |||||
MaxVF = VFs[i]; | MaxVF = VFs[i]; | ||||
break; | break; | ||||
} | } | ||||
} | } | ||||
if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { | if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { | ||||
if (MaxVF < MinVF) { | if (MaxVF < MinVF) { | ||||
LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF | LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF | ||||
<< ") with target's minimum: " << MinVF << '\n'); | << ") with target's minimum: " << MinVF << '\n'); | ||||
▲ Show 20 Lines • Show All 134 Lines • ▼ Show 20 Lines | unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, | ||||
if (Legal->getMaxSafeDepDistBytes() != -1U) | if (Legal->getMaxSafeDepDistBytes() != -1U) | ||||
return 1; | return 1; | ||||
// Do not interleave loops with a relatively small trip count. | // Do not interleave loops with a relatively small trip count. | ||||
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); | unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); | ||||
if (TC > 1 && TC < TinyTripCountInterleaveThreshold) | if (TC > 1 && TC < TinyTripCountInterleaveThreshold) | ||||
return 1; | return 1; | ||||
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); | |||||
LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters | |||||
<< " registers\n"); | |||||
if (VF == 1) { | |||||
if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) | |||||
TargetNumRegisters = ForceTargetNumScalarRegs; | |||||
} else { | |||||
if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) | |||||
TargetNumRegisters = ForceTargetNumVectorRegs; | |||||
} | |||||
RegisterUsage R = calculateRegisterUsage({VF})[0]; | RegisterUsage R = calculateRegisterUsage({VF})[0]; | ||||
// We divide by these constants so assume that we have at least one | // We divide by these constants so assume that we have at least one | ||||
// instruction that uses at least one register. | // instruction that uses at least one register. | ||||
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); | for (auto& pair : R.MaxLocalUsers) { | ||||
pair.second = std::max(pair.second, 1U); | |||||
} | |||||
// We calculate the interleave count using the following formula. | // We calculate the interleave count using the following formula. | ||||
// Subtract the number of loop invariants from the number of available | // Subtract the number of loop invariants from the number of available | ||||
// registers. These registers are used by all of the interleaved instances. | // registers. These registers are used by all of the interleaved instances. | ||||
// Next, divide the remaining registers by the number of registers that is | // Next, divide the remaining registers by the number of registers that is | ||||
// required by the loop, in order to estimate how many parallel instances | // required by the loop, in order to estimate how many parallel instances | ||||
// fit without causing spills. All of this is rounded down if necessary to be | // fit without causing spills. All of this is rounded down if necessary to be | ||||
// a power of two. We want power of two interleave count to simplify any | // a power of two. We want power of two interleave count to simplify any | ||||
// addressing operations or alignment considerations. | // addressing operations or alignment considerations. | ||||
// We also want power of two interleave counts to ensure that the induction | // We also want power of two interleave counts to ensure that the induction | ||||
// variable of the vector loop wraps to zero, when tail is folded by masking; | // variable of the vector loop wraps to zero, when tail is folded by masking; | ||||
// this currently happens when OptForSize, in which case IC is set to 1 above. | // this currently happens when OptForSize, in which case IC is set to 1 above. | ||||
unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / | unsigned IC = UINT_MAX; | ||||
R.MaxLocalUsers); | |||||
for (auto& pair : R.MaxLocalUsers) { | |||||
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); | |||||
LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters | |||||
<< " registers of " | |||||
<< TTI.getRegisterClassName(pair.first) << " register class\n"); | |||||
if (VF == 1) { | |||||
if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) | |||||
TargetNumRegisters = ForceTargetNumScalarRegs; | |||||
} else { | |||||
if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) | |||||
TargetNumRegisters = ForceTargetNumVectorRegs; | |||||
} | |||||
unsigned MaxLocalUsers = pair.second; | |||||
unsigned LoopInvariantRegs = 0; | |||||
if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) | |||||
LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; | |||||
unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); | |||||
// Don't count the induction variable as interleaved. | // Don't count the induction variable as interleaved. | ||||
if (EnableIndVarRegisterHeur) | if (EnableIndVarRegisterHeur) { | ||||
IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / | TmpIC = | ||||
std::max(1U, (R.MaxLocalUsers - 1))); | PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / | ||||
std::max(1U, (MaxLocalUsers - 1))); | |||||
} | |||||
IC = std::min(IC, TmpIC); | |||||
} | |||||
// Clamp the interleave ranges to reasonable counts. | // Clamp the interleave ranges to reasonable counts. | ||||
unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); | unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); | ||||
// Check if the user has overridden the max. | // Check if the user has overridden the max. | ||||
if (VF == 1) { | if (VF == 1) { | ||||
if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) | if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) | ||||
MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; | MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; | ||||
▲ Show 20 Lines • Show All 165 Lines • ▼ Show 20 Lines | LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { | ||||
unsigned MaxSafeDepDist = -1U; | unsigned MaxSafeDepDist = -1U; | ||||
if (Legal->getMaxSafeDepDistBytes() != -1U) | if (Legal->getMaxSafeDepDistBytes() != -1U) | ||||
MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; | MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; | ||||
unsigned WidestRegister = | unsigned WidestRegister = | ||||
std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); | std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); | ||||
const DataLayout &DL = TheFunction->getParent()->getDataLayout(); | const DataLayout &DL = TheFunction->getParent()->getDataLayout(); | ||||
SmallVector<RegisterUsage, 8> RUs(VFs.size()); | SmallVector<RegisterUsage, 8> RUs(VFs.size()); | ||||
SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); | SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); | ||||
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); | LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); | ||||
// A lambda that gets the register usage for the given type and VF. | // A lambda that gets the register usage for the given type and VF. | ||||
auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { | auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { | ||||
if (Ty->isTokenTy()) | if (Ty->isTokenTy()) | ||||
return 0U; | return 0U; | ||||
unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); | unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); | ||||
Show All 13 Lines | if (Ends.find(I) == Ends.end()) | ||||
continue; | continue; | ||||
// Skip ignored values. | // Skip ignored values. | ||||
if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) | if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) | ||||
continue; | continue; | ||||
// For each VF find the maximum usage of registers. | // For each VF find the maximum usage of registers. | ||||
for (unsigned j = 0, e = VFs.size(); j < e; ++j) { | for (unsigned j = 0, e = VFs.size(); j < e; ++j) { | ||||
// Count the number of live intervals. | |||||
SmallMapVector<unsigned, unsigned, 4> RegUsage; | |||||
if (VFs[j] == 1) { | if (VFs[j] == 1) { | ||||
MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); | for (auto Inst : OpenIntervals) { | ||||
continue; | unsigned ClassID = TTI.getRegisterClassForType(Inst->getType(), false); | ||||
if (RegUsage.find(ClassID) == RegUsage.end()) | |||||
RegUsage[ClassID] = 1; | |||||
else | |||||
RegUsage[ClassID] += 1; | |||||
} | } | ||||
} else { | |||||
collectUniformsAndScalars(VFs[j]); | collectUniformsAndScalars(VFs[j]); | ||||
// Count the number of live intervals. | |||||
unsigned RegUsage = 0; | |||||
for (auto Inst : OpenIntervals) { | for (auto Inst : OpenIntervals) { | ||||
// Skip ignored values for VF > 1. | // Skip ignored values for VF > 1. | ||||
if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || | if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) | ||||
isScalarAfterVectorization(Inst, VFs[j])) | |||||
continue; | continue; | ||||
RegUsage += GetRegUsage(Inst->getType(), VFs[j]); | if (isScalarAfterVectorization(Inst, VFs[j])) { | ||||
unsigned ClassID = TTI.getRegisterClassForType(Inst->getType(), false); | |||||
if (RegUsage.find(ClassID) == RegUsage.end()) | |||||
RegUsage[ClassID] = 1; | |||||
else | |||||
RegUsage[ClassID] += 1; | |||||
} else { | |||||
unsigned ClassID = TTI.getRegisterClassForType(Inst->getType(), true); | |||||
if (RegUsage.find(ClassID) == RegUsage.end()) | |||||
RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); | |||||
else | |||||
RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); | |||||
} | |||||
} | |||||
} | |||||
for (auto& pair : RegUsage) { | |||||
if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) | |||||
MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); | |||||
else | |||||
MaxUsages[j][pair.first] = pair.second; | |||||
} | } | ||||
MaxUsages[j] = std::max(MaxUsages[j], RegUsage); | |||||
} | } | ||||
LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " | LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " | ||||
<< OpenIntervals.size() << '\n'); | << OpenIntervals.size() << '\n'); | ||||
// Add the current instruction to the list of open intervals. | // Add the current instruction to the list of open intervals. | ||||
OpenIntervals.insert(I); | OpenIntervals.insert(I); | ||||
} | } | ||||
for (unsigned i = 0, e = VFs.size(); i < e; ++i) { | for (unsigned i = 0, e = VFs.size(); i < e; ++i) { | ||||
unsigned Invariant = 0; | SmallMapVector<unsigned, unsigned, 4> Invariant; | ||||
if (VFs[i] == 1) | |||||
Invariant = LoopInvariants.size(); | for (auto Inst : LoopInvariants) { | ||||
else { | unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); | ||||
for (auto Inst : LoopInvariants) | unsigned ClassID = TTI.getRegisterClassForType(Inst->getType(), VFs[i] > 1); | ||||
Invariant += GetRegUsage(Inst->getType(), VFs[i]); | if (Invariant.find(ClassID) == Invariant.end()) | ||||
Invariant[ClassID] = Usage; | |||||
else | |||||
Invariant[ClassID] += Usage; | |||||
} | } | ||||
LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); | LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); | ||||
LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); | LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " | ||||
LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant | << MaxUsages[i].size() << " item\n"); | ||||
<< '\n'); | for (const auto& pair : MaxUsages[i]) { | ||||
LLVM_DEBUG(dbgs() << "LV(REG): RegisterClass: " | |||||
<< TTI.getRegisterClassName(pair.first) | |||||
<< ", " << pair.second << " registers \n"); | |||||
} | |||||
LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " | |||||
<< Invariant.size() << " item\n"); | |||||
for (const auto& pair : Invariant) { | |||||
LLVM_DEBUG(dbgs() << "LV(REG): RegisterClass: " | |||||
<< TTI.getRegisterClassName(pair.first) | |||||
<< ", " << pair.second << " registers \n"); | |||||
} | |||||
RU.LoopInvariantRegs = Invariant; | RU.LoopInvariantRegs = Invariant; | ||||
RU.MaxLocalUsers = MaxUsages[i]; | RU.MaxLocalUsers = MaxUsages[i]; | ||||
RUs[i] = RU; | RUs[i] = RU; | ||||
} | } | ||||
return RUs; | return RUs; | ||||
} | } | ||||
▲ Show 20 Lines • Show All 2,329 Lines • ▼ Show 20 Lines | bool LoopVectorizePass::runImpl( | ||||
// Don't attempt if | // Don't attempt if | ||||
// 1. the target claims to have no vector registers, and | // 1. the target claims to have no vector registers, and | ||||
// 2. interleaving won't help ILP. | // 2. interleaving won't help ILP. | ||||
// | // | ||||
// The second condition is necessary because, even if the target has no | // The second condition is necessary because, even if the target has no | ||||
// vector registers, loop vectorization may still enable scalar | // vector registers, loop vectorization may still enable scalar | ||||
// interleaving. | // interleaving. | ||||
if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) | if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(F.getType(), true)) | ||||
&& TTI->getMaxInterleaveFactor(1) < 2) | |||||
hfinkel: I think that we can just make a separate function for this:
TTI->hasVectorRegisters()
(and… | |||||
I think it's could be like TTI->getRegisterClassForType(F.getType(), true) above wuzish: I think it's could be like `TTI->getRegisterClassForType(F.getType(), true)` above | |||||
But above, F.getType() gives back the right scalar type because F is the LSR::Formula. Here F in the function, right? I don't think it makes sense to ask for the register class of the function type. hfinkel: But above, F.getType() gives back the right scalar type because F is the LSR::Formula. Here F… | |||||
Yes. And above case would return nullptr, so we need care about this situation. And here we can left type to be nullptr as default value argument. wuzish: Yes. And above case would return nullptr, so we need care about this situation. And here we can… | |||||
I suppose that this makes sense if we assume that TTI->getRegisterClassForType is called only on legal types? I'm a bit worried here because, at the IR level, all types are supported and should be legalized into *some* register class. We should better document the expected behavior here one way or the other. hfinkel: I suppose that this makes sense if we assume that `TTI->getRegisterClassForType` is called only… | |||||
If I am not misunderstanding what you mean, then I think the type should be isSingleValueType. We can document it at the comments in getRegisterClassForType prototype declare. wuzish: If I am not misunderstanding what you mean, then I think the type should be `isSingleValueType`. | |||||
I agree, but I think that's insufficient. isSingleValueType is true for vector types, and so if we have an architecture with no vector registers, and we call TTI->getRegisterClassForType on a vector type, it would return the class associated with the scalarized type. I think that what you intend to say is something like: getRegisterClassForType returns the register class associated with the provided type, accounting for type promotion and other type-legalization techniques that the target might apply, however, it specifically does not account for the scalarization or splitting of vector types. Should a vector type require scalarization or splitting into multiple underlying vector registers, that type should be mapped to a register class containing no registers. In some sense, this seems reasonable, because the interface does not provide any way to figure out *how many* of a particular register in a register class the type might use, and so we can't return a sensible answer in cases where splitting or scalarization is required. It's a bit unfortunate, however, because the same consideration applies to scalar types too (e.g., i256 probably takes up multiple scalar registers). What we really should do, I think, is expand this interface to return, perhaps optionally, the number of registers of the provided class. In the implementation, in such a case, could start by running: std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); and then using the MVT for making the register-class decisions. However, it does not seem possible to make this change while retaining the current behavior for other targets (because we'd change what happens for illegal types), and thus, I recommend that a comment be added along the lines of my suggestion above, and then we also add this: // FIXME: It's not currently possible to determine how many registers are used by the provided type. and we address this aspect later in a different patch. hfinkel: I agree, but I think that's insufficient. isSingleValueType is true for vector types, and so if… | |||||
May the number issue be addressed by such code in LoopVectorize.cpp? // A lambda that gets the register usage for the given type and VF. auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { if (Ty->isTokenTy()) return 0U; unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); return std::max<unsigned>(1, VF * TypeSize / WidestRegister); }; And the WidestRegister is from TTI.getRegisterBitWidth which I think should be related with Type*. wuzish: May the number issue be addressed by such code in LoopVectorize.cpp?
```
// A lambda that gets… | |||||
return false; | return false; | ||||
bool Changed = false; | bool Changed = false; | ||||
// The vectorizer requires loops to be in simplified form. | // The vectorizer requires loops to be in simplified form. | ||||
// Since simplification may add new inner loops, it has to run before the | // Since simplification may add new inner loops, it has to run before the | ||||
// legality and profitability checks. This means running the loop vectorizer | // legality and profitability checks. This means running the loop vectorizer | ||||
// will simplify all loops, regardless of whether anything end up being | // will simplify all loops, regardless of whether anything end up being | ||||
▲ Show 20 Lines • Show All 74 Lines • Show Last 20 Lines |
I think that we can just make a separate function for this:
(and then use that here and in the SLP vectorizer).