Changeset View
Changeset View
Standalone View
Standalone View
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
- This file is larger than 256 KB, so syntax highlighting is disabled by default.
Show First 20 Lines • Show All 5,395 Lines • ▼ Show 20 Lines | InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { | ||||
SmallPtrSet<Value *, 16> ExtractCostCalculated; | SmallPtrSet<Value *, 16> ExtractCostCalculated; | ||||
InstructionCost ExtractCost = 0; | InstructionCost ExtractCost = 0; | ||||
SmallVector<unsigned> VF; | SmallVector<unsigned> VF; | ||||
SmallVector<SmallVector<int>> ShuffleMask; | SmallVector<SmallVector<int>> ShuffleMask; | ||||
SmallVector<Value *> FirstUsers; | SmallVector<Value *> FirstUsers; | ||||
SmallVector<APInt> DemandedElts; | SmallVector<APInt> DemandedElts; | ||||
for (ExternalUser &EU : ExternalUses) { | for (ExternalUser &EU : ExternalUses) { | ||||
// We only add extract cost once for the same scalar. | // We only add extract cost once for the same scalar. | ||||
if (!ExtractCostCalculated.insert(EU.Scalar).second) | if (!isa_and_nonnull<InsertElementInst>(EU.User) && | ||||
!ExtractCostCalculated.insert(EU.Scalar).second) | |||||
continue; | continue; | ||||
// Uses by ephemeral values are free (because the ephemeral value will be | // Uses by ephemeral values are free (because the ephemeral value will be | ||||
// removed prior to code generation, and so the extraction will be | // removed prior to code generation, and so the extraction will be | ||||
// removed as well). | // removed as well). | ||||
if (EphValues.count(EU.User)) | if (EphValues.count(EU.User)) | ||||
continue; | continue; | ||||
// No extract cost for vector "scalar" | // No extract cost for vector "scalar" | ||||
if (isa<FixedVectorType>(EU.Scalar->getType())) | if (isa<FixedVectorType>(EU.Scalar->getType())) | ||||
continue; | continue; | ||||
// Already counted the cost for external uses when tried to adjust the cost | // Already counted the cost for external uses when tried to adjust the cost | ||||
// for extractelements, no need to add it again. | // for extractelements, no need to add it again. | ||||
if (isa<ExtractElementInst>(EU.Scalar)) | if (isa<ExtractElementInst>(EU.Scalar)) | ||||
continue; | continue; | ||||
// If found user is an insertelement, do not calculate extract cost but try | // If found user is an insertelement, do not calculate extract cost but try | ||||
// to detect it as a final shuffled/identity match. | // to detect it as a final shuffled/identity match. | ||||
if (isa_and_nonnull<InsertElementInst>(EU.User)) { | if (isa_and_nonnull<InsertElementInst>(EU.User)) { | ||||
RKSimon: Pull out this NFC? | |||||
if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { | if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { | ||||
Optional<int> InsertIdx = getInsertIndex(EU.User, 0); | Optional<int> InsertIdx = getInsertIndex(EU.User, 0); | ||||
if (!InsertIdx || *InsertIdx == UndefMaskElem) | if (!InsertIdx || *InsertIdx == UndefMaskElem) | ||||
continue; | continue; | ||||
Value *VU = EU.User; | Value *VU = EU.User; | ||||
auto *It = find_if(FirstUsers, [VU](Value *V) { | auto *It = find_if(FirstUsers, [VU](Value *V) { | ||||
// Checks if 2 insertelements are from the same buildvector. | // Checks if 2 insertelements are from the same buildvector. | ||||
if (VU->getType() != V->getType()) | if (VU->getType() != V->getType()) | ||||
return false; | return false; | ||||
auto *IE1 = cast<InsertElementInst>(VU); | auto *IE1 = cast<InsertElementInst>(VU); | ||||
auto *IE2 = cast<InsertElementInst>(V); | auto *IE2 = cast<InsertElementInst>(V); | ||||
// Go through of insertelement instructions trying to find either VU | // Go through of insertelement instructions trying to find either VU | ||||
// as the original vector for IE2 or V as the original vector for IE1. | // as the original vector for IE2 or V as the original vector for IE1. | ||||
Not Done ReplyInline ActionsPull out this NFC? RKSimon: Pull out this NFC? | |||||
do { | do { | ||||
if (IE1 == VU || IE2 == V) | if (IE1 == VU || IE2 == V) | ||||
return true; | return true; | ||||
if (IE1) | if (IE1) | ||||
IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); | IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); | ||||
if (IE2) | if (IE2) | ||||
IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); | IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); | ||||
} while (IE1 || IE2); | } while (IE1 || IE2); | ||||
return false; | return false; | ||||
}); | }); | ||||
int VecId = -1; | int VecId = -1; | ||||
if (It == FirstUsers.end()) { | if (It == FirstUsers.end()) { | ||||
VF.push_back(FTy->getNumElements()); | VF.push_back(FTy->getNumElements()); | ||||
ShuffleMask.emplace_back(VF.back(), UndefMaskElem); | ShuffleMask.emplace_back(VF.back(), UndefMaskElem); | ||||
FirstUsers.push_back(EU.User); | // Find the insertvector, vectorized in tree, if any. | ||||
Value *Base = VU; | |||||
while (isa<InsertElementInst>(Base)) { | |||||
// Build the mask for the vectorized insertelement instructions. | |||||
if (const TreeEntry *E = getTreeEntry(Base)) { | |||||
VU = Base; | |||||
do { | |||||
int Idx = E->findLaneForValue(Base); | |||||
ShuffleMask.back()[Idx] = Idx; | |||||
Base = cast<InsertElementInst>(Base)->getOperand(0); | |||||
} while (E == getTreeEntry(Base)); | |||||
break; | |||||
} | |||||
Base = cast<InsertElementInst>(Base)->getOperand(0); | |||||
} | |||||
FirstUsers.push_back(VU); | |||||
DemandedElts.push_back(APInt::getZero(VF.back())); | DemandedElts.push_back(APInt::getZero(VF.back())); | ||||
Not Done ReplyInline ActionsShould this be put inside the while() loop before the break? AFAICT that's the only time that cast<InsertElementInst>(Base) is valid. RKSimon: Should this be put inside the while() loop before the break? AFAICT that's the only time that… | |||||
No, not quite so. It can be insertelement also if ScalarToTreeEntry.count(Base) is true, i.e. there is a tree entry for this insertelement in the graph. ABataev: No, not quite so. It can be insertelement also if `ScalarToTreeEntry.count(Base)` is true, i.e. | |||||
Not Done ReplyInline ActionsI don't quite follow, this is what I had in mind: // Find the insertvector, vectorized in tree, if any. Value *Base = VU; while (isa<InsertElementInst>(Base)) { if (ScalarToTreeEntry.count(Base)) { VU = Base; // Build the mask for the vectorized insertelement instructions. if (const TreeEntry *E = getTreeEntry(Base)) { do { int Idx = E->findLaneForValue(Base); ShuffleMask.back()[Idx] = Idx; Base = cast<InsertElementInst>(Base)->getOperand(0); } while (E == getTreeEntry(Base)); } break; } Base = cast<InsertElementInst>(Base)->getOperand(0); } RKSimon: I don't quite follow, this is what I had in mind:
```
// Find the insertvector, vectorized in… | |||||
Oh, I see. Actually, I was going to rework this block of code in a pretty similar way. ABataev: Oh, I see. Actually, I was going to rework this block of code in a pretty similar way. | |||||
VecId = FirstUsers.size() - 1; | VecId = FirstUsers.size() - 1; | ||||
} else { | } else { | ||||
VecId = std::distance(FirstUsers.begin(), It); | VecId = std::distance(FirstUsers.begin(), It); | ||||
} | } | ||||
int Idx = *InsertIdx; | int Idx = *InsertIdx; | ||||
ShuffleMask[VecId][Idx] = EU.Lane; | ShuffleMask[VecId][Idx] = EU.Lane; | ||||
DemandedElts[VecId].setBit(Idx); | DemandedElts[VecId].setBit(Idx); | ||||
continue; | |||||
} | } | ||||
} | } | ||||
// If we plan to rewrite the tree in a smaller type, we will need to sign | // If we plan to rewrite the tree in a smaller type, we will need to sign | ||||
// extend the extracted value back to the original type. Here, we account | // extend the extracted value back to the original type. Here, we account | ||||
// for the extract and the added cost of the sign extend if needed. | // for the extract and the added cost of the sign extend if needed. | ||||
auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); | auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); | ||||
auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; | auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; | ||||
if (MinBWs.count(ScalarRoot)) { | if (MinBWs.count(ScalarRoot)) { | ||||
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); | auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); | ||||
auto Extend = | auto Extend = | ||||
MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; | MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; | ||||
VecTy = FixedVectorType::get(MinTy, BundleWidth); | VecTy = FixedVectorType::get(MinTy, BundleWidth); | ||||
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), | ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), | ||||
VecTy, EU.Lane); | VecTy, EU.Lane); | ||||
} else { | } else { | ||||
ExtractCost += | ExtractCost += | ||||
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); | TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); | ||||
} | } | ||||
} | } | ||||
InstructionCost SpillCost = getSpillCost(); | InstructionCost SpillCost = getSpillCost(); | ||||
Cost += SpillCost + ExtractCost; | Cost += SpillCost + ExtractCost; | ||||
for (int I = 0, E = FirstUsers.size(); I < E; ++I) { | if (FirstUsers.size() == 1) { | ||||
// For the very first element - simple shuffle of the source vector. | int Limit = ShuffleMask.front().size() * 2; | ||||
int Limit = ShuffleMask[I].size() * 2; | if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && | ||||
if (I == 0 && | !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { | ||||
all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && | |||||
!ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { | |||||
InstructionCost C = TTI->getShuffleCost( | InstructionCost C = TTI->getShuffleCost( | ||||
TTI::SK_PermuteSingleSrc, | TTI::SK_PermuteSingleSrc, | ||||
cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]); | cast<FixedVectorType>(FirstUsers.front()->getType()), | ||||
ShuffleMask.front()); | |||||
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C | LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C | ||||
<< " for final shuffle of insertelement external users " | << " for final shuffle of insertelement external users " | ||||
<< *VectorizableTree.front()->Scalars.front() << ".\n" | << *VectorizableTree.front()->Scalars.front() << ".\n" | ||||
<< "SLP: Current total cost = " << Cost << "\n"); | << "SLP: Current total cost = " << Cost << "\n"); | ||||
Cost += C; | Cost += C; | ||||
continue; | |||||
} | } | ||||
// Other elements - permutation of 2 vectors (the initial one and the next | InstructionCost InsertCost = TTI->getScalarizationOverhead( | ||||
// Ith incoming vector). | cast<FixedVectorType>(FirstUsers.front()->getType()), | ||||
DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); | |||||
LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost | |||||
<< " for insertelements gather.\n" | |||||
<< "SLP: Current total cost = " << Cost << "\n"); | |||||
Cost -= InsertCost; | |||||
} else if (FirstUsers.size() >= 2) { | |||||
Not Done ReplyInline Actionscan FirstUsers be empty here ? RKSimon: can FirstUsers be empty here ? | |||||
Yes, if no insertelement users (check line 5381) or insert index is non-constant (line 5384-5385) ABataev: Yes, if no insertelement users (check line 5381) or insert index is non-constant (line 5384… | |||||
unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); | |||||
// Combined masks of the first 2 vectors. | |||||
SmallVector<int> CombinedMask(MaxVF, UndefMaskElem); | |||||
copy(ShuffleMask.front(), CombinedMask.begin()); | |||||
APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); | |||||
auto *VecTy = FixedVectorType::get( | |||||
Not Done ReplyInline ActionsWhy perform the OR with zero? RKSimon: Why perform the OR with zero? | |||||
Replaced by an assignment instead ABataev: Replaced by an assignment instead | |||||
cast<VectorType>(FirstUsers.front()->getType())->getElementType(), | |||||
MaxVF); | |||||
for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { | |||||
if (ShuffleMask[1][I] != UndefMaskElem) { | |||||
CombinedMask[I] = ShuffleMask[1][I] + MaxVF; | |||||
CombinedDemandedElts.setBit(I); | |||||
} | |||||
} | |||||
InstructionCost C = | |||||
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); | |||||
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C | |||||
<< " for final shuffle of vector node and external " | |||||
"insertelement users " | |||||
<< *VectorizableTree.front()->Scalars.front() << ".\n" | |||||
<< "SLP: Current total cost = " << Cost << "\n"); | |||||
Cost += C; | |||||
InstructionCost InsertCost = TTI->getScalarizationOverhead( | |||||
VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); | |||||
LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost | |||||
<< " for insertelements gather.\n" | |||||
<< "SLP: Current total cost = " << Cost << "\n"); | |||||
Cost -= InsertCost; | |||||
for (int I = 2, E = FirstUsers.size(); I < E; ++I) { | |||||
// Other elements - permutation of 2 vectors (the initial one and the | |||||
// next Ith incoming vector). | |||||
unsigned VF = ShuffleMask[I].size(); | unsigned VF = ShuffleMask[I].size(); | ||||
for (unsigned Idx = 0; Idx < VF; ++Idx) { | for (unsigned Idx = 0; Idx < VF; ++Idx) { | ||||
int &Mask = ShuffleMask[I][Idx]; | int Mask = ShuffleMask[I][Idx]; | ||||
Mask = Mask == UndefMaskElem ? Idx : VF + Mask; | if (Mask != UndefMaskElem) | ||||
} | CombinedMask[Idx] = MaxVF + Mask; | ||||
InstructionCost C = TTI->getShuffleCost( | else if (CombinedMask[Idx] != UndefMaskElem) | ||||
TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()), | CombinedMask[Idx] = Idx; | ||||
ShuffleMask[I]); | } | ||||
LLVM_DEBUG( | for (unsigned Idx = VF; Idx < MaxVF; ++Idx) | ||||
dbgs() | if (CombinedMask[Idx] != UndefMaskElem) | ||||
<< "SLP: Adding cost " << C | CombinedMask[Idx] = Idx; | ||||
<< " for final shuffle of vector node and external insertelement users " | InstructionCost C = | ||||
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); | |||||
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C | |||||
<< " for final shuffle of vector node and external " | |||||
"insertelement users " | |||||
<< *VectorizableTree.front()->Scalars.front() << ".\n" | << *VectorizableTree.front()->Scalars.front() << ".\n" | ||||
<< "SLP: Current total cost = " << Cost << "\n"); | << "SLP: Current total cost = " << Cost << "\n"); | ||||
Cost += C; | Cost += C; | ||||
InstructionCost InsertCost = TTI->getScalarizationOverhead( | InstructionCost InsertCost = TTI->getScalarizationOverhead( | ||||
cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], | cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], | ||||
/*Insert*/ true, | /*Insert*/ true, /*Extract*/ false); | ||||
/*Extract*/ false); | |||||
Cost -= InsertCost; | |||||
LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost | LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost | ||||
<< " for insertelements gather.\n" | << " for insertelements gather.\n" | ||||
<< "SLP: Current total cost = " << Cost << "\n"); | << "SLP: Current total cost = " << Cost << "\n"); | ||||
Cost -= InsertCost; | |||||
} | |||||
} | } | ||||
#ifndef NDEBUG | #ifndef NDEBUG | ||||
SmallString<256> Str; | SmallString<256> Str; | ||||
{ | { | ||||
raw_svector_ostream OS(Str); | raw_svector_ostream OS(Str); | ||||
OS << "SLP: Spill Cost = " << SpillCost << ".\n" | OS << "SLP: Spill Cost = " << SpillCost << ".\n" | ||||
<< "SLP: Extract Cost = " << ExtractCost << ".\n" | << "SLP: Extract Cost = " << ExtractCost << ".\n" | ||||
▲ Show 20 Lines • Show All 4,305 Lines • Show Last 20 Lines |
Pull out this NFC?