diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1080,6 +1080,7 @@ class BoUpSLP { struct TreeEntry; struct ScheduleData; + class ShuffleCostEstimator; class ShuffleInstructionBuilder; public: @@ -6792,43 +6793,38 @@ }; } // namespace -InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, - ArrayRef VectorizedVals) { - ArrayRef VL = E->Scalars; - - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (CmpInst *CI = dyn_cast(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - else if (auto *IE = dyn_cast(VL[0])) - ScalarTy = IE->getOperand(1)->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - - // If we have computed a smaller type for the expression, update VecTy so - // that the costs will be accurate. - if (MinBWs.count(VL[0])) - VecTy = FixedVectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); - unsigned EntryVF = E->getVectorFactor(); - auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); +/// Merges shuffle masks and emits final shuffle instruction, if required. It +/// supports shuffling of 2 input vectors. It implements lazy shuffles emission, +/// when the actual shuffle instruction is generated only if this is actually +/// required. Otherwise, the shuffle instruction emission is delayed till the +/// end of the process, to reduce the number of emitted instructions and further +/// analysis/transformations. +class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { + bool IsFinalized = false; + const TargetTransformInfo &TTI; + InstructionCost Cost = 0; + ArrayRef VectorizedVals; + BoUpSLP &R; + constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - // FIXME: it tries to fix a problem with MSVC buildbots. - TargetTransformInfo *TTI = this->TTI; - auto AdjustExtractsCost = [=](InstructionCost &Cost, - ArrayRef Mask) -> Value * { +public: + ShuffleCostEstimator(TargetTransformInfo &TTI, + ArrayRef VectorizedVals, BoUpSLP &R) + : TTI(TTI), VectorizedVals(VectorizedVals), R(R) {} + Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask) { if (Mask.empty()) return nullptr; Value *VecBase = nullptr; + ArrayRef VL = E->Scalars; + auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); // If the resulting type is scalarized, do not adjust the cost. - unsigned VecNumParts = TTI->getNumberOfParts(VecTy); + unsigned VecNumParts = TTI.getNumberOfParts(VecTy); if (VecNumParts == VecTy->getNumElements()) return nullptr; DenseMap ExtractVectorsTys; SmallPtrSet CheckedExtracts; for (auto [I, V] : enumerate(VL)) { + // Ignore non-extractelement scalars. if (isa(V) || (!Mask.empty() && Mask[I] == UndefMaskElem)) continue; // If all users of instruction are going to be vectorized and this @@ -6837,9 +6833,9 @@ // vectorized tree. // Also, avoid adjusting the cost for extractelements with multiple uses // in different graph entries. - const TreeEntry *VE = getTreeEntry(V); + const TreeEntry *VE = R.getTreeEntry(V); if (!CheckedExtracts.insert(V).second || - !areAllUsersVectorized(cast(V), VectorizedVals) || + !R.areAllUsersVectorized(cast(V), VectorizedVals) || (VE && VE != E)) continue; auto *EE = cast(V); @@ -6848,7 +6844,7 @@ if (!EEIdx) continue; unsigned Idx = *EEIdx; - if (VecNumParts != TTI->getNumberOfParts(EE->getVectorOperandType())) { + if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) { auto It = ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; It->getSecond() = std::min(It->second, Idx); @@ -6861,18 +6857,17 @@ })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. - Cost -= - TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); + Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); // Add back the cost of s|zext which is subtracted separately. - Cost += TTI->getCastInstrCost( + Cost += TTI.getCastInstrCost( Ext->getOpcode(), Ext->getType(), EE->getType(), TTI::getCastContextHint(Ext), CostKind, Ext); continue; } } - Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, - Idx); + Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, + Idx); } // Add a cost for subvector extracts/inserts if required. for (const auto &Data : ExtractVectorsTys) { @@ -6880,37 +6875,70 @@ unsigned NumElts = VecTy->getNumElements(); if (Data.second % NumElts == 0) continue; - if (TTI->getNumberOfParts(EEVTy) > VecNumParts) { + if (TTI.getNumberOfParts(EEVTy) > VecNumParts) { unsigned Idx = (Data.second / NumElts) * NumElts; unsigned EENumElts = EEVTy->getNumElements(); if (Idx % NumElts == 0) continue; if (Idx + NumElts <= EENumElts) { - Cost += - TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, VecTy); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, std::nullopt, CostKind, Idx, VecTy); } else { // Need to round up the subvector type vectorization factor to avoid a // crash in cost model functions. Make SubVT so that Idx + VF of SubVT // <= EENumElts. auto *SubVT = FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); - Cost += - TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, SubVT); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, std::nullopt, CostKind, Idx, SubVT); } } else { - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, std::nullopt, CostKind, 0, EEVTy); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, + VecTy, std::nullopt, CostKind, 0, EEVTy); } } return VecBase; - }; + } + /// Finalize emission of the shuffles. + InstructionCost finalize() { + IsFinalized = true; + return Cost; + } + + ~ShuffleCostEstimator() { + assert(IsFinalized && "Shuffle construction must be finalized."); + } +}; + +InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, + ArrayRef VectorizedVals) { + ArrayRef VL = E->Scalars; + + Type *ScalarTy = VL[0]->getType(); + if (auto *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + else if (auto *CI = dyn_cast(VL[0])) + ScalarTy = CI->getOperand(0)->getType(); + else if (auto *IE = dyn_cast(VL[0])) + ScalarTy = IE->getOperand(1)->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + // If we have computed a smaller type for the expression, update VecTy so + // that the costs will be accurate. + if (MinBWs.count(VL[0])) + VecTy = FixedVectorType::get( + IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + unsigned EntryVF = E->getVectorFactor(); + auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); + + bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { if (allConstant(VL)) return 0; if (isa(VL[0])) return InstructionCost::getInvalid(); + ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this); unsigned VF = E->getVectorFactor(); SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), E->ReuseShuffleIndices.end()); @@ -6933,15 +6961,15 @@ if (UserIgnoreList) IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); - InstructionCost Cost = 0; bool Resized = false; - if (Value *VecBase = AdjustExtractsCost(Cost, ExtractMask)) + if (Value *VecBase = Estimator.adjustExtracts(E, ExtractMask)) if (auto *VecBaseTy = dyn_cast(VecBase->getType())) if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { Resized = true; GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } + InstructionCost ExtractCost = Estimator.finalize(); // Do not try to look for reshuffled loads for gathered loads (they will be // handled later), for vectorized scalars, and cases, which are definitely @@ -7003,11 +7031,10 @@ // single input vector or of 2 input vectors. InstructionCost Cost = computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI); - AdjustExtractsCost(Cost, ExtractMask); if (NeedToShuffleReuses) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); - return Cost; + return Cost + ExtractCost; } if (isSplat(VL)) { // Found the broadcasting of the single scalar, calculate the cost as the