diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -275,6 +275,51 @@
                              ArrayRef<const Value *> Operands,
                              TargetCostKind CostKind = TCK_SizeAndLatency) const;
 
+  /// Describe known properties for a set of pointers.
+  struct PointersChainInfo {
+    /// All the GEPs in the set have the same base address.
+    bool IsSameBaseAddress : 1;
+    /// The following properties are only valid if IsSameBaseAddress is set.
+    /// True if the distance between any two neighbouring pointers is the
+    /// same value.
+    bool IsUniformStride : 1;
+    /// True if the distance between any two neighbouring pointers is a known
+    /// value.
+    bool IsKnownStride : 1;
+    unsigned Reserved : 29;
+
+    bool isSameBase() const { return IsSameBaseAddress; }
+    bool isUniformStride() const {
+      return IsSameBaseAddress && IsUniformStride;
+    }
+    bool isKnownStride() const { return IsSameBaseAddress && IsKnownStride; }
+
+    static PointersChainInfo getKnownUniformStrided() {
+      return {/*IsSameBaseAddress=*/true, /*IsUniformStride=*/true,
+              /*IsKnownStride=*/true};
+    }
+    static PointersChainInfo getUniformStrided() {
+      return {/*IsSameBaseAddress=*/true, /*IsUniformStride=*/true,
+              /*IsKnownStride=*/false};
+    }
+    static PointersChainInfo getKnownNonUniformStrided() {
+      return {/*IsSameBaseAddress=*/true, /*IsUniformStride=*/false,
+              /*IsKnownStride=*/true};
+    }
+    static PointersChainInfo getNonUniformStrided() {
+      return {/*IsSameBaseAddress=*/true, /*IsUniformStride=*/false,
+              /*IsKnownStride=*/false};
+    }
+  };
+
+  /// Estimate the cost of a chain of pointers (typically pointer operands of
+  /// a chain of loads or stores) when lowered. All the GEPs must belong to
+  /// the same basic block.
+  InstructionCost
+  getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
+                       const PointersChainInfo &Info,
+                       TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
+
   /// \returns A value by which our inlining threshold should be multiplied.
   /// This is primarily used to bump up the inlining threshold wholesale on
   /// targets where calls are unusually expensive.
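For orientation, a minimal caller sketch (not part of the patch; the helper name costUniformChain and its parameters are hypothetical): a client that knows its pointers are adjacent same-base accesses picks a uniform-strided variant, upgrading to the "known" flavour only when the stride is a compile-time constant.

#include "llvm/Analysis/TargetTransformInfo.h"
#include <cassert>
using namespace llvm;

// Hypothetical helper: cost the pointer operands of a chain of adjacent
// same-base loads/stores. KnownStride selects between the two
// uniform-strided factory methods introduced above.
static InstructionCost costUniformChain(const TargetTransformInfo &TTIRef,
                                        ArrayRef<const Value *> PtrOps,
                                        const Value *Base, bool KnownStride) {
  TargetTransformInfo::PointersChainInfo Info =
      KnownStride
          ? TargetTransformInfo::PointersChainInfo::getKnownUniformStrided()
          : TargetTransformInfo::PointersChainInfo::getUniformStrided();
  // All four factory methods imply a common base address.
  assert(Info.isSameBase() && "uniform stride implies a common base");
  // CostKind defaults to TCK_RecipThroughput per the declaration above.
  return TTIRef.getPointersChainCost(PtrOps, Base, Info);
}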
@@ -1602,6 +1647,10 @@
   virtual InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                                      ArrayRef<const Value *> Operands,
                                      TTI::TargetCostKind CostKind) = 0;
+  virtual InstructionCost
+  getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
+                       const TTI::PointersChainInfo &Info,
+                       TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getInliningThresholdMultiplier() = 0;
   virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
   virtual int getInlinerVectorBonusPercent() = 0;
@@ -1959,6 +2008,12 @@
                              TargetTransformInfo::TargetCostKind CostKind) override {
     return Impl.getGEPCost(PointeeType, Ptr, Operands, CostKind);
   }
+  InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs,
+                                       const Value *Base,
+                                       const PointersChainInfo &Info,
+                                       TargetCostKind CostKind) override {
+    return Impl.getPointersChainCost(Ptrs, Base, Info, CostKind);
+  }
   unsigned getInliningThresholdMultiplier() override {
     return Impl.getInliningThresholdMultiplier();
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1022,6 +1022,47 @@
     return TTI::TCC_Basic;
   }
 
+  InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs,
+                                       const Value *Base,
+                                       const TTI::PointersChainInfo &Info,
+                                       TTI::TargetCostKind CostKind) {
+    InstructionCost Cost = TTI::TCC_Free;
+
+    if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
+      SmallVector<const Value *> Indices(BaseGEP->indices());
+      Cost += static_cast<T *>(this)->getGEPCost(
+          BaseGEP->getSourceElementType(), BaseGEP->getPointerOperand(),
+          Indices, CostKind);
+    }
+
+    // In the basic model we only take into account whether all the GEPs have
+    // the same base address. If all pointers are relative to the same base
+    // address, we cost each of them as a single ADD operation whenever any
+    // of its indices is non-constant. Otherwise they are just a set of GEPs
+    // with no known dependencies between them.
+    for (const Value *V : Ptrs) {
+      if (V == Base)
+        continue;
+      const auto *GEP = dyn_cast<GetElementPtrInst>(V);
+      if (!GEP)
+        continue;
+
+      if (Info.isSameBase()) {
+        if (GEP->hasAllConstantIndices())
+          continue;
+        Cost += static_cast<T *>(this)->getArithmeticInstrCost(
+            Instruction::Add, GEP->getType(), CostKind,
+            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
+            std::nullopt);
+      } else {
+        SmallVector<const Value *> Indices(GEP->indices());
+        Cost += static_cast<T *>(this)->getGEPCost(
+            GEP->getSourceElementType(), GEP->getPointerOperand(), Indices,
+            CostKind);
+      }
+    }
+    return Cost;
+  }
+
   InstructionCost getInstructionCost(const User *U,
                                      ArrayRef<const Value *> Operands,
                                      TTI::TargetCostKind CostKind) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -220,6 +220,12 @@
   return TTIImpl->getGEPCost(PointeeType, Ptr, Operands, CostKind);
 }
 
+InstructionCost TargetTransformInfo::getPointersChainCost(
+    ArrayRef<const Value *> Ptrs, const Value *Base,
+    const TTI::PointersChainInfo &Info, TTI::TargetCostKind CostKind) const {
+  return TTIImpl->getPointersChainCost(Ptrs, Base, Info, CostKind);
+}
+
 unsigned TargetTransformInfo::getEstimatedNumberOfCaseClusters(
     const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI,
     BlockFrequencyInfo *BFI) const {
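To make the default model concrete, here is a standalone restatement (a sketch, not the CRTP code above; walkChain and the unit cost are stand-ins): with a common base, a member GEP whose indices are all constant is free, since its offset typically folds into the addressing mode; a member GEP with any variable index costs one Add; and without a common base every GEP is costed individually via getGEPCost.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// Sketch of the default chain model. One abstract unit stands in for the
// target's Add cost; comments mark where the real code calls getGEPCost.
static InstructionCost walkChain(ArrayRef<const Value *> Ptrs,
                                 const Value *Base, bool SameBase) {
  InstructionCost Cost = 0; // the real code starts from TTI::TCC_Free
  for (const Value *V : Ptrs) {
    if (V == Base)
      continue;
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue; // plain pointers (no instruction) are treated as free
    if (SameBase)
      Cost += GEP->hasAllConstantIndices() ? 0 : 1; // one Add per member GEP
    else
      Cost += 1; // stand-in for getGEPCost() on this GEP's own operands
  }
  return Cost;
}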
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7151,37 +7151,81 @@
   // Calculate cost difference from vectorizing set of GEPs.
   // Negative value means vectorizing is profitable.
   auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
-    InstructionCost CostSavings = 0;
-    for (Value *V : Ptrs) {
-      if (V == BasePtr)
-        continue;
-      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
-      // GEPs may contain just addresses without instructions, considered free.
-      // GEPs with all constant indices also considered to have zero cost.
-      if (!Ptr || Ptr->hasAllConstantIndices())
-        continue;
+    InstructionCost ScalarCost = 0;
+    InstructionCost VecCost = 0;
+    // Here we differentiate two cases: (1) when Ptrs represent a regular
+    // vectorization tree node (as they are pointer arguments of scattered
+    // loads) or (2) when Ptrs are the arguments of loads or stores being
+    // vectorized as a plain wide unit-stride load/store, since all the
+    // loads/stores are known to be from/to adjacent locations.
+    if (isa<LoadInst, StoreInst>(VL0)) {
+      // Case 2: estimate the pointer related costs when vectorizing to a
+      // wide load/store.
+      // The scalar cost is estimated as a set of pointers with a known
+      // relationship between them.
+      // For vector code we will use BasePtr as the argument for the wide
+      // load/store, but we also need to account for all the instructions
+      // which are going to stay in the vectorized code due to uses outside
+      // of these scalar loads/stores.
+      ScalarCost = TTI->getPointersChainCost(
+          Ptrs, BasePtr, TTI::PointersChainInfo::getKnownUniformStrided(),
+          CostKind);
+
+      SmallVector<const Value *> PtrsRetainedInVecCode;
+      for (Value *V : Ptrs) {
+        if (V == BasePtr) {
+          PtrsRetainedInVecCode.push_back(V);
+          continue;
+        }
+        auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+        // For simplicity, assume Ptr stays in the vectorized code if it's
+        // not a GEP instruction. We don't care, since its cost is considered
+        // free.
+        // TODO: We should check for any uses outside of the vectorizable
+        // tree rather than just a single use.
+        if (!Ptr || !Ptr->hasOneUse())
+          PtrsRetainedInVecCode.push_back(V);
+      }
 
-      // Here we differentiate two cases: when GEPs represent a regular
-      // vectorization tree node (and hence vectorized) and when the set is
-      // arguments of a set of loads or stores being vectorized. In the former
-      // case all the scalar GEPs will be removed as a result of vectorization.
+      // If all pointers stay in the vectorized code then we don't have any
+      // savings on them.
+      if (PtrsRetainedInVecCode.size() == Ptrs.size())
+        VecCost = ScalarCost;
+      else
+        VecCost = TTI->getPointersChainCost(
+            PtrsRetainedInVecCode, BasePtr,
+            TTI::PointersChainInfo::getKnownNonUniformStrided(), CostKind);
+    } else {
+      // Case 1: Ptrs are the arguments of loads that we are going to
+      // transform into a masked gather load intrinsic.
+      // All the scalar GEPs will be removed as a result of vectorization.
       // For any external uses of some lanes extract element instructions will
-      // be generated (which cost is estimated separately). For the latter case
-      // since the set of GEPs itself is not vectorized those used more than
-      // once will remain staying in vectorized code as well. So we should not
-      // count them as savings.
-      if (!Ptr->hasOneUse() && isa<LoadInst, StoreInst>(VL0))
-        continue;
+      // be generated (whose cost is estimated separately).
+      TTI::PointersChainInfo PtrsInfo =
+          all_of(Ptrs,
+                 [](const Value *V) {
+                   auto *Ptr = dyn_cast<GetElementPtrInst>(V);
+                   return Ptr && !Ptr->hasAllConstantIndices();
+                 })
+              ? TTI::PointersChainInfo::getNonUniformStrided()
+              : TTI::PointersChainInfo::getKnownNonUniformStrided();
 
-      // TODO: it is target dependent, so need to implement and then use a TTI
-      // interface.
-      CostSavings += TTI->getArithmeticInstrCost(Instruction::Add,
-                                                 Ptr->getType(), CostKind);
+      ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo,
+                                             CostKind);
+
+      // Remark: it is not quite correct to use the scalar GEP cost for a
+      // vector GEP, but it's not clear how to do that without having the
+      // vector GEP arguments ready. Perhaps using just TTI::TCC_Free or
+      // TTI::TCC_Basic would be a better option.
+      if (const auto *Base = dyn_cast<GetElementPtrInst>(BasePtr)) {
+        SmallVector<const Value *> Indices(Base->indices());
+        VecCost = TTI->getGEPCost(Base->getSourceElementType(),
+                                  Base->getPointerOperand(), Indices,
+                                  CostKind);
+      }
     }
-    LLVM_DEBUG(dbgs() << "SLP: Calculated GEPs cost savings or Tree:\n";
-               E->dump());
-    LLVM_DEBUG(dbgs() << "SLP: GEP cost saving = " << CostSavings << "\n");
-    return InstructionCost() - CostSavings;
+
+    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
+                             "Calculated GEPs cost for Tree"));
+
+    return VecCost - ScalarCost;
   };
 
   switch (ShuffleOrOp) {
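A closing note on the new return convention, with a hypothetical composition helper (not patch code): GetGEPCostDiff now returns VecCost - ScalarCost instead of negated savings, so callers fold the delta straight into the node's vector-minus-scalar total; a negative result still means the pointer chain becomes cheaper under vectorization.

#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// Hypothetical composition: the per-node cost delta plus the GEP-chain delta.
// A negative sum means vectorizing the node, pointer chain included, pays off.
static InstructionCost totalNodeDelta(InstructionCost VecNodeCost,
                                      InstructionCost ScalarNodeCost,
                                      InstructionCost GEPCostDiff) {
  return VecNodeCost - ScalarNodeCost + GEPCostDiff;
}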