diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -275,6 +275,45 @@
                              ArrayRef<const Value *> Operands,
                              TargetCostKind CostKind = TCK_SizeAndLatency) const;
 
+  /// Describe known properties for a set of pointers.
+  struct PointersChainInfo {
+    /// All the GEPs in the set have the same base address.
+    bool IsSameBaseAddress = false;
+
+    /// The following properties are only valid if IsSameBaseAddress is set.
+    /// True if the distance between any two neighbouring pointers is the
+    /// same value.
+    bool IsUniformStride = false;
+    /// True if the distance between any two neighbouring pointers is a
+    /// known value.
+    bool IsKnownStride = false;
+
+    bool isSameBase() const { return IsSameBaseAddress; }
+    bool isUniformStride() const {
+      return IsSameBaseAddress && IsUniformStride;
+    }
+    bool isKnownStride() const { return IsSameBaseAddress && IsKnownStride; }
+
+    static PointersChainInfo getKnownUniformStrided() {
+      return {true, true, true};
+    }
+    static PointersChainInfo getUniformStrided() { return {true, true, false}; }
+    static PointersChainInfo getKnownNonUniformStrided() {
+      return {true, false, true};
+    }
+    static PointersChainInfo getNonUniformStrided() {
+      return {true, false, false};
+    }
+  };
+
+  /// Estimate the cost of a chain of pointer operations (typically the
+  /// pointer operands of a chain of loads or stores) when lowered. All the
+  /// GEPs must belong to the same basic block.
+  InstructionCost
+  getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
+                       PointersChainInfo Info,
+                       TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
+
   /// \returns A value by which our inlining threshold should be multiplied.
   /// This is primarily used to bump up the inlining threshold wholesale on
   /// targets where calls are unusually expensive.
@@ -1602,6 +1641,10 @@
   virtual InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                                      ArrayRef<const Value *> Operands,
                                      TTI::TargetCostKind CostKind) = 0;
+  virtual InstructionCost
+  getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
+                       TTI::PointersChainInfo Info,
+                       TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getInliningThresholdMultiplier() = 0;
   virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
   virtual int getInlinerVectorBonusPercent() = 0;
@@ -1959,6 +2002,12 @@
                              TargetTransformInfo::TargetCostKind CostKind) override {
     return Impl.getGEPCost(PointeeType, Ptr, Operands, CostKind);
   }
+  InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs,
+                                       const Value *Base,
+                                       PointersChainInfo Info,
+                                       TargetCostKind CostKind) override {
+    return Impl.getPointersChainCost(Ptrs, Base, Info, CostKind);
+  }
   unsigned getInliningThresholdMultiplier() override {
     return Impl.getInliningThresholdMultiplier();
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1022,6 +1022,45 @@
     return TTI::TCC_Basic;
   }
 
+  InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs,
+                                       const Value *Base,
+                                       TTI::PointersChainInfo Info,
+                                       TTI::TargetCostKind CostKind) {
+    InstructionCost Cost = TTI::TCC_Free;
+
+    if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
+      SmallVector<const Value *> Indices(BaseGEP->indices());
+      Cost += static_cast<T *>(this)->getGEPCost(
+          BaseGEP->getSourceElementType(), BaseGEP->getPointerOperand(),
+          Indices, CostKind);
+    }
+
+    // In the basic model we only take into account whether all the GEPs have
+    // the same base address.
+    for (const Value *V : Ptrs) {
+      if (V == Base)
+        continue;
+      const auto *GEP = dyn_cast<GetElementPtrInst>(V);
+      if (!GEP)
+        continue;
+
+      if (Info.isSameBase()) {
+        if (GEP->hasAllConstantIndices())
+          continue;
+        Cost += static_cast<T *>(this)->getArithmeticInstrCost(
+            Instruction::Add, GEP->getType(), CostKind,
+            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
+            std::nullopt);
+      } else {
+        SmallVector<const Value *> Indices(GEP->indices());
+        Cost += static_cast<T *>(this)->getGEPCost(GEP->getSourceElementType(),
+                                                   GEP->getPointerOperand(),
+                                                   Indices, CostKind);
+      }
+    }
+    return Cost;
+  }
+
   InstructionCost getInstructionCost(const User *U,
                                      ArrayRef<const Value *> Operands,
                                      TTI::TargetCostKind CostKind) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -220,6 +220,12 @@
   return TTIImpl->getGEPCost(PointeeType, Ptr, Operands, CostKind);
 }
 
+InstructionCost TargetTransformInfo::getPointersChainCost(
+    ArrayRef<const Value *> Ptrs, const Value *Base,
+    TTI::PointersChainInfo Info, TTI::TargetCostKind CostKind) const {
+  return TTIImpl->getPointersChainCost(Ptrs, Base, Info, CostKind);
+}
+
 unsigned TargetTransformInfo::getEstimatedNumberOfCaseClusters(
     const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI,
     BlockFrequencyInfo *BFI) const {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2812,9 +2812,13 @@
 #ifndef NDEBUG
   void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
-                     InstructionCost VecCost,
-                     InstructionCost ScalarCost) const {
-    dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();
+                     InstructionCost VecCost, InstructionCost ScalarCost,
+                     const char *Banner = nullptr) const {
+    if (Banner)
+      dbgs() << "SLP: " << Banner << ":\n";
+    else
+      dbgs() << "SLP: Calculated costs for Tree:\n";
+    E->dump();
     dbgs() << "SLP: Costs:\n";
     dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
     dbgs() << "SLP: VectorCost = " << VecCost << "\n";
     dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
@@ -7141,46 +7145,70 @@
     InstructionCost VecCost = VectorCost(CommonCost);
     LLVM_DEBUG(
         dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost));
-    // Disable warnings for `this` and `E` are unused. Required for
-    // `dumpTreeCosts`.
-    (void)this;
-    (void)E;
     return VecCost - ScalarCost;
   };
 
   // Calculate cost difference from vectorizing set of GEPs.
   // Negative value means vectorizing is profitable.
   auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
-    InstructionCost CostSavings = 0;
-    for (Value *V : Ptrs) {
-      if (V == BasePtr)
-        continue;
-      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
-      // GEPs may contain just addresses without instructions, considered free.
-      // GEPs with all constant indices also considered to have zero cost.
-      if (!Ptr || Ptr->hasAllConstantIndices())
-        continue;
+    InstructionCost ScalarCost = 0;
+    InstructionCost VecCost = 0;
+    // Here we differentiate two cases: when the GEPs represent a regular
+    // vectorization tree node (and hence are vectorized themselves) and when
+    // the set is the pointer arguments of a set of loads or stores being
+    // vectorized. In the former case all the scalar GEPs will be removed as a
+    // result of vectorization; for any external uses of some lanes, extract
+    // element instructions will be generated (whose cost is estimated
+    // separately). In the latter case, since the set of GEPs itself is not
+    // vectorized, those used more than once will remain in the vectorized
+    // code as well, so we should not count them as savings.
+    if (isa<LoadInst, StoreInst>(VL0)) {
+      ScalarCost = TTI->getPointersChainCost(
+          Ptrs, BasePtr, TTI::PointersChainInfo::getKnownUniformStrided(),
+          CostKind);
+
+      SmallVector<Value *> MultipleUseGEPs;
+      for (Value *V : Ptrs) {
+        if (V == BasePtr) {
+          MultipleUseGEPs.push_back(V);
+          continue;
+        }
+        auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+        if (Ptr && !Ptr->hasOneUse())
+          MultipleUseGEPs.push_back(V);
+      }
-      // Here we differentiate two cases: when GEPs represent a regular
-      // vectorization tree node (and hence vectorized) and when the set is
-      // arguments of a set of loads or stores being vectorized. In the former
-      // case all the scalar GEPs will be removed as a result of vectorization.
-      // For any external uses of some lanes extract element instructions will
-      // be generated (which cost is estimated separately). For the latter case
-      // since the set of GEPs itself is not vectorized those used more than
-      // once will remain staying in vectorized code as well. So we should not
-      // count them as savings.
-      if (!Ptr->hasOneUse() && isa<LoadInst, StoreInst>(VL0))
-        continue;
+      // If all the pointers stay in the vectorized code, there are no
+      // savings from removing the scalar GEPs.
+      if (MultipleUseGEPs.size() == Ptrs.size())
+        VecCost = ScalarCost;
+      else
+        VecCost = TTI->getPointersChainCost(
+            MultipleUseGEPs, BasePtr,
+            TTI::PointersChainInfo::getKnownNonUniformStrided(), CostKind);
+    } else {
+      if (all_of(Ptrs, [](const Value *V) {
+            auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+            return !Ptr || Ptr->hasAllConstantIndices();
+          }))
+        ScalarCost = TTI->getPointersChainCost(
+            Ptrs, BasePtr, TTI::PointersChainInfo::getKnownNonUniformStrided(),
+            CostKind);
+      else
+        ScalarCost = TTI->getPointersChainCost(
+            Ptrs, BasePtr, TTI::PointersChainInfo::getNonUniformStrided(),
+            CostKind);
-      // TODO: it is target dependent, so need to implement and then use a TTI
-      // interface.
-      CostSavings += TTI->getArithmeticInstrCost(Instruction::Add,
-                                                 Ptr->getType(), CostKind);
+      if (const auto *Base = dyn_cast<GetElementPtrInst>(BasePtr)) {
+        SmallVector<const Value *> Indices(Base->indices());
+        VecCost = TTI->getGEPCost(Base->getSourceElementType(),
+                                  Base->getPointerOperand(), Indices, CostKind);
+      }
     }
-    LLVM_DEBUG(dbgs() << "SLP: Calculated GEPs cost savings or Tree:\n";
-               E->dump());
-    LLVM_DEBUG(dbgs() << "SLP: GEP cost saving = " << CostSavings << "\n");
-    return InstructionCost() - CostSavings;
+
+    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
+                             "Calculated GEPs cost for Tree"));
+
+    return VecCost - ScalarCost;
   };
 
   switch (ShuffleOrOp) {
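A minimal client-side sketch of the new interface (the helper name getScalarAddressChainCost and the consecutive-load scenario are illustrative, not part of this patch): the pointer operands of a chain of consecutive loads share a base and a compile-time-known stride, so the strongest PointersChainInfo variant applies.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Hypothetical helper, not part of this patch: query the cost of the scalar
// address chain feeding a run of consecutive loads. The pointers share a base
// and a constant stride, so getKnownUniformStrided() describes them exactly.
static InstructionCost
getScalarAddressChainCost(const TargetTransformInfo &TTI,
                          ArrayRef<const Value *> PointerOps,
                          const Value *BasePtr) {
  return TTI.getPointersChainCost(
      PointerOps, BasePtr,
      TargetTransformInfo::PointersChainInfo::getKnownUniformStrided(),
      TargetTransformInfo::TCK_RecipThroughput);
}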
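A second sketch, mirroring the SLP heuristic above for pointers that stay scalar (the helper name getScalarPointersCost is made up): when every member of the set is either a plain pointer or a GEP with all-constant indices, the per-pointer offsets are known at compile time, so getKnownNonUniformStrided() is the right classification; otherwise only the common base is known and getNonUniformStrided() applies.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical helper, not part of this patch: classify a same-base pointer
// set by whether all GEP offsets are compile-time constants, then query the
// chain cost with the matching PointersChainInfo.
static InstructionCost
getScalarPointersCost(const TargetTransformInfo &TTI,
                      ArrayRef<const Value *> Ptrs, const Value *BasePtr,
                      TargetTransformInfo::TargetCostKind CostKind) {
  bool OffsetsKnown = all_of(Ptrs, [](const Value *V) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    return !GEP || GEP->hasAllConstantIndices();
  });
  auto Info =
      OffsetsKnown
          ? TargetTransformInfo::PointersChainInfo::getKnownNonUniformStrided()
          : TargetTransformInfo::PointersChainInfo::getNonUniformStrided();
  return TTI.getPointersChainCost(Ptrs, BasePtr, Info, CostKind);
}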