diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -322,9 +322,11 @@
   /// Estimate the cost of a chain of pointers (typically pointer operands of a
   /// chain of loads or stores within same block) operations set when lowered.
+  /// \p AccessTy is the type of the loads/stores that will ultimately use the
+  /// \p Ptrs.
   InstructionCost
   getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
-                       const PointersChainInfo &Info,
+                       const PointersChainInfo &Info, Type *AccessTy,
                        TargetCostKind CostKind = TTI::TCK_RecipThroughput
 
   ) const;
 
@@ -1665,7 +1667,7 @@
                                      TTI::TargetCostKind CostKind) = 0;
   virtual InstructionCost
   getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
-                       const TTI::PointersChainInfo &Info,
+                       const TTI::PointersChainInfo &Info, Type *AccessTy,
                        TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getInliningThresholdMultiplier() = 0;
   virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
@@ -2026,8 +2028,9 @@
   InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs,
                                        const Value *Base,
                                        const PointersChainInfo &Info,
+                                       Type *AccessTy,
                                        TargetCostKind CostKind) override {
-    return Impl.getPointersChainCost(Ptrs, Base, Info, CostKind);
+    return Impl.getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
   }
   unsigned getInliningThresholdMultiplier() override {
     return Impl.getInliningThresholdMultiplier();
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1041,6 +1041,7 @@
   InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs,
                                        const Value *Base,
                                        const TTI::PointersChainInfo &Info,
+                                       Type *AccessTy,
                                        TTI::TargetCostKind CostKind) {
     InstructionCost Cost = TTI::TCC_Free;
     // In the basic model we take into account GEP instructions only
@@ -1050,15 +1051,28 @@
     // pointers are relative to the same base address, all the rest are
     // either GEP instructions, PHIs, bitcasts or constants. When we have same
     // base, we just calculate cost of each non-Base GEP as an ADD operation if
-    // any their index is a non-const.
+    // any of their indices isn't foldable.
     // If no known dependecies between the pointers cost is calculated as a sum
     // of costs of GEP instructions.
-    for (const Value *V : Ptrs) {
+    for (auto [I, V] : enumerate(Ptrs)) {
       const auto *GEP = dyn_cast<GetElementPtrInst>(V);
       if (!GEP)
         continue;
       if (Info.isSameBase() && V != Base) {
-        if (GEP->hasAllConstantIndices())
+        if (GEP->hasAllConstantIndices())
+          continue;
+        // If the chain is unit-stride and BaseReg + stride*i is a legal
+        // addressing mode, then presume the base GEP is sitting around in a
+        // register somewhere and check if we can fold the offset relative to
+        // it.
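+        // For example, with i32 accesses Stride is 4, so the I-th pointer in
+        // the chain is checked as BaseReg + 4*I; if the target can fold that
+        // offset into the load/store, the GEP is treated as free below.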
+        unsigned Stride = DL.getTypeStoreSize(AccessTy);
+        if (Info.isUniformStride() &&
+            static_cast<T *>(this)->isLegalAddressingMode(
+                AccessTy,
+                /* BaseGV */ nullptr,
+                /* BaseOffset */ Stride * I,
+                /* HasBaseReg */ true,
+                /* Scale */ 0, GEP->getType()->getPointerAddressSpace()))
           continue;
         Cost += static_cast<T *>(this)->getArithmeticInstrCost(
             Instruction::Add, GEP->getType(), CostKind,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -230,10 +230,11 @@
 InstructionCost TargetTransformInfo::getPointersChainCost(
     ArrayRef<const Value *> Ptrs, const Value *Base,
-    const TTI::PointersChainInfo &Info, TTI::TargetCostKind CostKind) const {
+    const TTI::PointersChainInfo &Info, Type *AccessTy,
+    TTI::TargetCostKind CostKind) const {
   assert((Base || !Info.isSameBase()) &&
          "If pointers have same base address it has to be provided.");
-  return TTIImpl->getPointersChainCost(Ptrs, Base, Info, CostKind);
+  return TTIImpl->getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
 }
 
 unsigned TargetTransformInfo::getEstimatedNumberOfCaseClusters(
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -180,6 +180,7 @@
   InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs,
                                        const Value *Base,
                                        const TTI::PointersChainInfo &Info,
+                                       Type *AccessTy,
                                        TTI::TargetCostKind CostKind);
   InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
                                             const SCEV *Ptr);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4943,9 +4943,11 @@
   return Cost + LT.first;
 }
 
-InstructionCost X86TTIImpl::getPointersChainCost(
-    ArrayRef<const Value *> Ptrs, const Value *Base,
-    const TTI::PointersChainInfo &Info, TTI::TargetCostKind CostKind) {
+InstructionCost
+X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
+                                 const Value *Base,
+                                 const TTI::PointersChainInfo &Info,
+                                 Type *AccessTy, TTI::TargetCostKind CostKind) {
   if (Info.isSameBase() && Info.isKnownStride()) {
     // If all the pointers have known stride all the differences are translated
     // into constants. X86 memory addressing allows encoding it into
@@ -4957,7 +4959,7 @@
     }
     return TTI::TCC_Free;
   }
-  return BaseT::getPointersChainCost(Ptrs, Base, Info, CostKind);
+  return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
 }
 
 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7264,7 +7264,8 @@
       // stay in vectorized code due to uses outside of these scalar
       // loads/stores.
       ScalarCost = TTI->getPointersChainCost(
-          Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), CostKind);
+          Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
+          CostKind);
       SmallVector<const Value *> PtrsRetainedInVecCode;
       for (Value *V : Ptrs) {
@@ -7290,7 +7291,7 @@
       }
       VecCost = TTI->getPointersChainCost(
           PtrsRetainedInVecCode, BasePtr,
-          TTI::PointersChainInfo::getKnownStride(), CostKind);
+          TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind);
     } else {
       // Case 1: Ptrs are the arguments of loads that we are going to transform
       // into masked gather load intrinsic.
@@ -7306,7 +7307,8 @@
                         ? TTI::PointersChainInfo::getUnknownStride()
                         : TTI::PointersChainInfo::getKnownStride();
 
-      ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, CostKind);
+      ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
+                                             CostKind);
 
       // Remark: it not quite correct to use scalar GEP cost for a vector GEP,
       // but it's not clear how to do that without having vector GEP arguments
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/getpointerschaincost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/getpointerschaincost.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/getpointerschaincost.ll
@@ -0,0 +1,67 @@
+; RUN: opt -mtriple=riscv64 -mattr=+v -riscv-v-slp-max-vf=0 -passes=slp-vectorizer -disable-output -debug-only=SLP < %s 2>&1 | FileCheck %s
+
+; Because all of these addresses are foldable, the scalar cost should be 0 when
+; computing the pointers chain cost.
+define void @f(ptr %dest, i64 %i) {
+; CHECK-LABEL: SLP: Analyzing blocks in f.
+; CHECK: SLP: Calculated GEPs cost for Tree:
+; CHECK: SLP: Costs:
+; CHECK: SLP: ReuseShuffleCost = 0
+; CHECK-NEXT: SLP: VectorCost = 0
+; CHECK-NEXT: SLP: ScalarCost = 0
+; CHECK-NEXT: SLP: ReuseShuffleCost + VecCost - ScalarCost = 0
+entry:
+  %p1 = getelementptr i32, ptr %dest, i32 0
+  store i32 1, ptr %p1
+  %p2 = getelementptr i32, ptr %dest, i32 1
+  store i32 1, ptr %p2
+  %p3 = getelementptr i32, ptr %dest, i32 2
+  store i32 1, ptr %p3
+  %p4 = getelementptr i32, ptr %dest, i32 3
+  store i32 1, ptr %p4
+  ret void
+}
+
+; When computing the scalar pointers chain cost here, there is a cost of 1 for
+; the base pointer, and the rest can be folded in, so the scalar cost should be
+; 1.
+define void @g(ptr %dest, i64 %i) {
+; CHECK-LABEL: SLP: Analyzing blocks in g.
+; CHECK: SLP: Calculated GEPs cost for Tree:
+; CHECK: SLP: Costs:
+; CHECK-NEXT: SLP: ReuseShuffleCost = 0
+; CHECK-NEXT: SLP: VectorCost = 1
+; CHECK-NEXT: SLP: ScalarCost = 1
+entry:
+  %p1 = getelementptr i32, ptr %dest, i32 2048
+  store i32 1, ptr %p1
+  %p2 = getelementptr i32, ptr %dest, i32 2049
+  store i32 1, ptr %p2
+  %p3 = getelementptr i32, ptr %dest, i32 2050
+  store i32 1, ptr %p3
+  %p4 = getelementptr i32, ptr %dest, i32 2051
+  store i32 1, ptr %p4
+  ret void
+}
+
+; When computing the scalar pointers chain cost here, there is a cost of 1 for
+; the base pointer, and the rest can be folded in, so the scalar cost should be
+; 1.
+define void @h(ptr %dest, i32 %i) {
+; CHECK-LABEL: SLP: Analyzing blocks in h.
+; CHECK: SLP: Calculated GEPs cost for Tree:
+; CHECK: SLP: Costs:
+; CHECK-NEXT: SLP: ReuseShuffleCost = 0
+; CHECK-NEXT: SLP: VectorCost = 1
+; CHECK-NEXT: SLP: ScalarCost = 1
+entry:
+  %p1 = getelementptr [4 x i32], ptr %dest, i32 %i, i32 0
+  store i32 1, ptr %p1
+  %p2 = getelementptr [4 x i32], ptr %dest, i32 %i, i32 1
+  store i32 1, ptr %p2
+  %p3 = getelementptr [4 x i32], ptr %dest, i32 %i, i32 2
+  store i32 1, ptr %p3
+  %p4 = getelementptr [4 x i32], ptr %dest, i32 %i, i32 3
+  store i32 1, ptr %p4
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll
@@ -2,7 +2,8 @@
 ; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \
 ; RUN: -riscv-v-slp-max-vf=0 -S | FileCheck %s
 
-; FIXME: This should not be vectorized
+; This shouldn't be vectorized as the extra address computation required for the
+; vector instructions makes it unprofitable
 
 %struct.2i32 = type { i32, i32 }
 
@@ -10,7 +11,9 @@
 ; CHECK-LABEL: @splat_store_v2i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr [[STRUCT_2I32:%.*]], ptr [[DEST:%.*]], i64 [[I:%.*]], i32 0
-; CHECK-NEXT:    store <2 x i32> <i32 1, i32 1>, ptr [[P1]], align 4
+; CHECK-NEXT:    store i32 1, ptr [[P1]], align 4
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr [[STRUCT_2I32]], ptr [[DEST]], i64 [[I]], i32 1
+; CHECK-NEXT:    store i32 1, ptr [[P2]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry: