diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -326,9 +326,11 @@
   /// Estimate the cost of a chain of pointers (typically pointer operands of a
   /// chain of loads or stores within same block) operations set when lowered.
+  /// \p MemOpTy is the type of the loads/stores that will ultimately use the
+  /// \p Ptrs.
   InstructionCost
   getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
-                       const PointersChainInfo &Info,
+                       const PointersChainInfo &Info, Type *MemOpTy,
                        TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
@@ -1669,7 +1671,7 @@
                                            TTI::TargetCostKind CostKind) = 0;
   virtual InstructionCost
   getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
-                       const TTI::PointersChainInfo &Info,
+                       const TTI::PointersChainInfo &Info, Type *MemOpTy,
                        TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getInliningThresholdMultiplier() = 0;
   virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
@@ -2030,8 +2032,9 @@
   InstructionCost
   getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
                        const PointersChainInfo &Info,
+                       Type *MemOpTy,
                        TargetCostKind CostKind) override {
-    return Impl.getPointersChainCost(Ptrs, Base, Info, CostKind);
+    return Impl.getPointersChainCost(Ptrs, Base, Info, MemOpTy, CostKind);
   }
   unsigned getInliningThresholdMultiplier() override {
     return Impl.getInliningThresholdMultiplier();
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1041,6 +1041,7 @@
   InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs,
                                        const Value *Base,
                                        const TTI::PointersChainInfo &Info,
+                                       Type *MemOpTy,
                                        TTI::TargetCostKind CostKind) {
     InstructionCost Cost = TTI::TCC_Free;
     // In the basic model we take into account GEP instructions only
@@ -1053,13 +1054,24 @@
     // any their index is a non-const.
     // If no known dependecies between the pointers cost is calculated as a sum
     // of costs of GEP instructions.
-    for (const Value *V : Ptrs) {
+    for (auto [I, V] : enumerate(Ptrs)) {
       const auto *GEP = dyn_cast<GetElementPtrInst>(V);
       if (!GEP)
         continue;
       if (Info.isSameBase() && V != Base) {
         if (GEP->hasAllConstantIndices())
           continue;
+        // If the chain is unit-stride and BaseReg + Stride * I is a legal
+        // addressing mode, then we don't need an extra ADD operation.
+        unsigned Stride = DL.getTypeStoreSize(MemOpTy);
+        if (Info.isUniformStride() &&
+            static_cast<T *>(this)->isLegalAddressingMode(
+                MemOpTy,
+                /* BaseGV */ nullptr,
+                /* BaseOffset */ Stride * I,
+                /* HasBaseReg */ true,
+                /* Scale */ 0, GEP->getType()->getPointerAddressSpace()))
+          continue;
         Cost += static_cast<T *>(this)->getArithmeticInstrCost(
             Instruction::Add, GEP->getType(), CostKind,
             {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -230,10 +230,11 @@
 InstructionCost TargetTransformInfo::getPointersChainCost(
     ArrayRef<const Value *> Ptrs, const Value *Base,
-    const TTI::PointersChainInfo &Info, TTI::TargetCostKind CostKind) const {
+    const TTI::PointersChainInfo &Info, Type *MemOpTy,
+    TTI::TargetCostKind CostKind) const {
   assert((Base || !Info.isSameBase()) &&
          "If pointers have same base address it has to be provided.");
-  return TTIImpl->getPointersChainCost(Ptrs, Base, Info, CostKind);
+  return TTIImpl->getPointersChainCost(Ptrs, Base, Info, MemOpTy, CostKind);
 }

 unsigned TargetTransformInfo::getEstimatedNumberOfCaseClusters(
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -177,10 +177,6 @@
                                             Align Alignment,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I);
-  InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs,
-                                       const Value *Base,
-                                       const TTI::PointersChainInfo &Info,
-                                       TTI::TargetCostKind CostKind);
   InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
                                             const SCEV *Ptr);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4943,23 +4943,6 @@
   return Cost + LT.first;
 }

-InstructionCost X86TTIImpl::getPointersChainCost(
-    ArrayRef<const Value *> Ptrs, const Value *Base,
-    const TTI::PointersChainInfo &Info, TTI::TargetCostKind CostKind) {
-  if (Info.isSameBase() && Info.isKnownStride()) {
-    // If all the pointers have known stride all the differences are translated
-    // into constants. X86 memory addressing allows encoding it into
-    // displacement. So we just need to take the base GEP cost.
-    if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
-      SmallVector<const Value *> Indices(BaseGEP->indices());
-      return getGEPCost(BaseGEP->getSourceElementType(),
-                        BaseGEP->getPointerOperand(), Indices, CostKind);
-    }
-    return TTI::TCC_Free;
-  }
-  return BaseT::getPointersChainCost(Ptrs, Base, Info, CostKind);
-}
-
 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
                                                       ScalarEvolution *SE,
                                                       const SCEV *Ptr) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7265,7 +7265,7 @@
       // loads/stores.
       ScalarCost = TTI->getPointersChainCost(
           Ptrs, BasePtr, TTI::PointersChainInfo::getKnownUniformStrided(),
-          CostKind);
+          ScalarTy, CostKind);
       SmallVector<const Value *> PtrsRetainedInVecCode;
       for (Value *V : Ptrs) {
@@ -7291,7 +7291,7 @@
       }
       VecCost = TTI->getPointersChainCost(
           PtrsRetainedInVecCode, BasePtr,
-          TTI::PointersChainInfo::getKnownNonUniformStrided(), CostKind);
+          TTI::PointersChainInfo::getKnownNonUniformStrided(), VecTy, CostKind);
     } else {
       // Case 1: Ptrs are the arguments of loads that we are going to transform
       // into masked gather load intrinsic.
@@ -7307,7 +7307,8 @@
               ? TTI::PointersChainInfo::getNonUniformStrided()
               : TTI::PointersChainInfo::getKnownNonUniformStrided();
-      ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, CostKind);
+      ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
+                                             CostKind);

       // Remark: it not quite correct to use scalar GEP cost for a vector GEP,
       // but it's not clear how to do that without having vector GEP arguments
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll
@@ -2,7 +2,8 @@
 ; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \
 ; RUN:   -riscv-v-slp-max-vf=0 -S | FileCheck %s

-; FIXME: This should not be vectorized
+; This shouldn't be vectorized as the extra address computation required for
+; the vector instructions makes it unprofitable.

 %struct.2i32 = type { i32, i32 }

@@ -10,7 +11,9 @@
 ; CHECK-LABEL: @splat_store_v2i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr [[STRUCT_2I32:%.*]], ptr [[DEST:%.*]], i64 [[I:%.*]], i32 0
-; CHECK-NEXT:    store <2 x i32> <i32 1, i32 1>, ptr [[P1]], align 4
+; CHECK-NEXT:    store i32 1, ptr [[P1]], align 4
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr [[STRUCT_2I32]], ptr [[DEST]], i64 [[I]], i32 1
+; CHECK-NEXT:    store i32 1, ptr [[P2]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
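
The patch itself contains no caller outside the SLP vectorizer, so the sketch below is only an illustration (not part of the change): it shows how a cost-model client would pass the memory-access type through the extended getPointersChainCost() interface. The wrapper function, its name, and its parameter names are invented for this example; only getPointersChainCost(), PointersChainInfo::getKnownUniformStrided(), and TCK_RecipThroughput come from the patched headers.

// Hypothetical helper, not part of the patch: cost a run of uniformly strided
// pointers that all feed loads/stores of type AccessTy.
#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

static InstructionCost costUniformPointerChain(const TargetTransformInfo &TTI,
                                               ArrayRef<const Value *> Ptrs,
                                               const Value *BasePtr,
                                               Type *AccessTy) {
  // Passing the access type lets the generic implementation ask
  // isLegalAddressingMode() whether BaseReg + Stride * I folds into the
  // memory operation itself; if it does, no per-pointer ADD is charged.
  return TTI.getPointersChainCost(
      Ptrs, BasePtr,
      TargetTransformInfo::PointersChainInfo::getKnownUniformStrided(),
      AccessTy, TargetTransformInfo::TCK_RecipThroughput);
}

With the access type available, the generic model can consult the target's addressing modes directly (displacement folding on X86, base register plus 12-bit immediate on RISC-V), which is why the X86-specific override is deleted and why the RISC-V struct-gep test now keeps the cheaper scalar stores.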