diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -320,9 +320,11 @@
   /// Estimate the cost of a chain of pointers (typically pointer operands of a
   /// chain of loads or stores within same block) operations set when lowered.
+  /// \p AccessTy is the type of the loads/stores that will ultimately use the
+  /// \p Ptrs.
   InstructionCost
   getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
-                       const PointersChainInfo &Info,
+                       const PointersChainInfo &Info, Type *AccessTy,
                        TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
@@ -1663,7 +1665,7 @@
                                  TTI::TargetCostKind CostKind) = 0;
   virtual InstructionCost
   getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
-                       const TTI::PointersChainInfo &Info,
+                       const TTI::PointersChainInfo &Info, Type *AccessTy,
                        TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getInliningThresholdMultiplier() = 0;
   virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
@@ -2024,8 +2026,9 @@
   InstructionCost
   getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
                        const PointersChainInfo &Info,
+                       Type *AccessTy,
                        TargetCostKind CostKind) override {
-    return Impl.getPointersChainCost(Ptrs, Base, Info, CostKind);
+    return Impl.getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
   }
   unsigned getInliningThresholdMultiplier() override {
     return Impl.getInliningThresholdMultiplier();
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1041,6 +1041,7 @@
   InstructionCost
   getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
                        const TTI::PointersChainInfo &Info,
+                       Type *AccessTy,
                        TTI::TargetCostKind CostKind) {
     InstructionCost Cost = TTI::TCC_Free;
     // In the basic model we take into account GEP instructions only
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -230,10 +230,11 @@
 InstructionCost TargetTransformInfo::getPointersChainCost(
     ArrayRef<const Value *> Ptrs, const Value *Base,
-    const TTI::PointersChainInfo &Info, TTI::TargetCostKind CostKind) const {
+    const TTI::PointersChainInfo &Info, Type *AccessTy,
+    TTI::TargetCostKind CostKind) const {
   assert((Base || !Info.isSameBase()) &&
          "If pointers have same base address it has to be provided.");
-  return TTIImpl->getPointersChainCost(Ptrs, Base, Info, CostKind);
+  return TTIImpl->getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
 }
 
 unsigned TargetTransformInfo::getEstimatedNumberOfCaseClusters(
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -106,6 +106,12 @@
                                          Align Alignment, unsigned AddressSpace,
                                          TTI::TargetCostKind CostKind);
 
+  InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs,
+                                       const Value *Base,
+                                       const TTI::PointersChainInfo &Info,
+                                       Type *AccessTy,
+                                       TTI::TargetCostKind CostKind);
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP,
                                OptimizationRemarkEmitter *ORE);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1592,6 +1592,55 @@
   }
 }
 
+// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
+InstructionCost RISCVTTIImpl::getPointersChainCost(
+    ArrayRef<const Value *> Ptrs, const Value *Base,
+    const TTI::PointersChainInfo &Info, Type *AccessTy,
+    TTI::TargetCostKind CostKind) {
+  InstructionCost Cost = TTI::TCC_Free;
+  // In the basic model we take into account GEP instructions only
+  // (although here can also come an alloca instruction, a value, constants
+  // and/or constant expressions, PHIs, bitcasts ... whatever is allowed to
+  // be used as a pointer). Typically, if Base is not a GEP instruction and
+  // all the pointers are relative to the same base address, all the rest
+  // are either GEP instructions, PHIs, bitcasts or constants. When we have
+  // the same base, we just calculate the cost of each non-Base GEP as an
+  // ADD operation if any of its indices is a non-constant.
+  // If there are no known dependencies between the pointers, the cost is
+  // calculated as a sum of the costs of the GEP instructions.
+  for (auto [I, V] : enumerate(Ptrs)) {
+    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
+    if (!GEP)
+      continue;
+    if (Info.isSameBase() && V != Base) {
+      if (GEP->hasAllConstantIndices())
+        continue;
+      // If the chain is unit-stride and BaseReg + stride*i is a legal
+      // addressing mode, then presume the base GEP is sitting around in a
+      // register somewhere and check if we can fold the offset relative to
+      // it.
+      unsigned Stride = DL.getTypeStoreSize(AccessTy);
+      if (Info.isUnitStride() &&
+          isLegalAddressingMode(AccessTy,
+                                /* BaseGV */ nullptr,
+                                /* BaseOffset */ Stride * I,
+                                /* HasBaseReg */ true,
+                                /* Scale */ 0,
+                                GEP->getType()->getPointerAddressSpace()))
+        continue;
+      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
+                                     {TTI::OK_AnyValue, TTI::OP_None},
+                                     {TTI::OK_AnyValue, TTI::OP_None},
+                                     std::nullopt);
+    } else {
+      SmallVector<const Value *> Indices(GEP->indices());
+      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
+                         Indices, CostKind);
+    }
+  }
+  return Cost;
+}
+
 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
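
(Illustration, not part of the patch: a sketch of the chain shape this
targets, mirroring the @h test further down. Assuming AccessTy = i32, the
store size is 4, so pointer I in a unit-stride chain sits at BaseOffset =
4*I; such offsets fit the 12-bit signed immediate of RISC-V scalar
loads/stores, so isLegalAddressingMode succeeds and each non-base GEP is
costed as free instead of as an ADD. The function name is hypothetical.)

    define void @unit_stride_chain(ptr %dest, i64 %i) {
      ; Base GEP: still costed via getGEPCost.
      %p0 = getelementptr [4 x i32], ptr %dest, i64 %i, i32 0
      store i32 1, ptr %p0, align 4
      ; Non-base GEPs: offsets 4, 8, 12 from %p0 fold into the scalar
      ; store's immediate (e.g. "sw t0, 4(a0)"), so they cost TCC_Free.
      %p1 = getelementptr [4 x i32], ptr %dest, i64 %i, i32 1
      store i32 1, ptr %p1, align 4
      %p2 = getelementptr [4 x i32], ptr %dest, i64 %i, i32 2
      store i32 1, ptr %p2, align 4
      %p3 = getelementptr [4 x i32], ptr %dest, i64 %i, i32 3
      store i32 1, ptr %p3, align 4
      ret void
    }
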
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -180,6 +180,7 @@
   InstructionCost getPointersChainCost(ArrayRef<const Value *> Ptrs,
                                        const Value *Base,
                                        const TTI::PointersChainInfo &Info,
+                                       Type *AccessTy,
                                        TTI::TargetCostKind CostKind);
   InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
                                             const SCEV *Ptr);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4943,9 +4945,11 @@
   return Cost + LT.first;
 }
 
-InstructionCost X86TTIImpl::getPointersChainCost(
-    ArrayRef<const Value *> Ptrs, const Value *Base,
-    const TTI::PointersChainInfo &Info, TTI::TargetCostKind CostKind) {
+InstructionCost
+X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
+                                 const Value *Base,
+                                 const TTI::PointersChainInfo &Info,
+                                 Type *AccessTy, TTI::TargetCostKind CostKind) {
   if (Info.isSameBase() && Info.isKnownStride()) {
     // If all the pointers have known stride all the differences are translated
     // into constants. X86 memory addressing allows encoding it into
@@ -4957,7 +4959,7 @@
     }
     return TTI::TCC_Free;
   }
-  return BaseT::getPointersChainCost(Ptrs, Base, Info, CostKind);
+  return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
 }
 
 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7394,7 +7394,8 @@
       // stay in vectorized code due to uses outside of these scalar
       // loads/stores.
       ScalarCost = TTI->getPointersChainCost(
-          Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), CostKind);
+          Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
+          CostKind);
 
       SmallVector<const Value *> PtrsRetainedInVecCode;
       for (Value *V : Ptrs) {
@@ -7420,7 +7421,7 @@
       }
       VecCost = TTI->getPointersChainCost(
           PtrsRetainedInVecCode, BasePtr,
-          TTI::PointersChainInfo::getKnownStride(), CostKind);
+          TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind);
     } else {
       // Case 1: Ptrs are the arguments of loads that we are going to transform
       // into masked gather load intrinsic.
@@ -7436,7 +7437,8 @@
                           ? TTI::PointersChainInfo::getUnknownStride()
                           : TTI::PointersChainInfo::getKnownStride();
 
-      ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, CostKind);
+      ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
+                                             CostKind);
 
       // Remark: it is not quite correct to use scalar GEP cost for a vector GEP,
      // but it's not clear how to do that without having vector GEP arguments
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/getpointerschaincost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/getpointerschaincost.ll
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/getpointerschaincost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/getpointerschaincost.ll
@@ -69,7 +69,7 @@
   ret void
 }
 
-; FIXME: When computing the scalar pointers chain cost here, there is a cost of
+; When computing the scalar pointers chain cost here, there is a cost of
 ; 1 for the base pointer, and the rest can be folded in, so the scalar cost
 ; should be 1.
 define void @h(ptr %dest, i32 %i) {
@@ -86,7 +86,7 @@
 ; YAML-NEXT:  Function:        h
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:    - String:          'Stores SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-5'
+; YAML-NEXT:    - Cost:            '-2'
 ; YAML-NEXT:    - String:          ' and with tree size '
 ; YAML-NEXT:    - TreeSize:        '2'
   %p1 = getelementptr [4 x i32], ptr %dest, i32 %i, i32 0
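
(Side note on the '-5' -> '-2' delta above, not part of the patch: @h stores
through what appear to be four unit-stride pointers, so the scalar chain
previously cost 4: 1 for the base GEP plus an ADD for each of the 3 non-base
GEPs. With those offsets now folding into the addressing mode, the scalar
cost drops to 1, and the reported vectorization saving shrinks by 3, from
-5 to -2.)
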
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/struct-gep.ll
@@ -2,7 +2,9 @@
 ; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \
 ; RUN:   -riscv-v-slp-max-vf=0 -S | FileCheck %s
 
-; FIXME: This should not be vectorized
+; This shouldn't be vectorized, as the extra address computation required for
+; the vector store makes it unprofitable (vle/vse don't have an offset in
+; their addressing modes).
 
 %struct.2i32 = type { i32, i32 }
 
@@ -10,7 +12,9 @@
 ; CHECK-LABEL: @splat_store_v2i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr [[STRUCT_2I32:%.*]], ptr [[DEST:%.*]], i64 [[I:%.*]], i32 0
-; CHECK-NEXT:    store <2 x i32> <i32 1, i32 1>, ptr [[P1]], align 4
+; CHECK-NEXT:    store i32 1, ptr [[P1]], align 4
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr [[STRUCT_2I32]], ptr [[DEST]], i64 [[I]], i32 1
+; CHECK-NEXT:    store i32 1, ptr [[P2]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
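
(Illustration, not part of the patch: why splat_store_v2i32 now stays scalar.
Scalar sw takes an imm(reg) address, so the second field's offset is free in
the scalar chain, while vse32.v only takes a bare (reg) address and would
need an extra address computation. The comments below are assumptions based
on that reasoning, annotating the IR the test expects after SLP runs.)

    %struct.2i32 = type { i32, i32 }

    define void @splat_store_v2i32(ptr %dest, i64 %i) {
    entry:
      %p1 = getelementptr %struct.2i32, ptr %dest, i64 %i, i32 0
      store i32 1, ptr %p1, align 4
      ; Offset 4 folds into the scalar store ("sw t0, 4(a0)"), so this GEP
      ; is free in the scalar chain; a <2 x i32> vse32.v has no immediate
      ; form, so the vector version would pay for an extra ADD.
      %p2 = getelementptr %struct.2i32, ptr %dest, i64 %i, i32 1
      store i32 1, ptr %p2, align 4
      ret void
    }
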