Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -23,6 +23,8 @@
 #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
 
 #include "llvm/ADT/Optional.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Operator.h"
@@ -601,14 +603,36 @@
   /// split during legalization. Zero is returned when the answer is unknown.
   unsigned getNumberOfParts(Type *Tp) const;
 
+  /// \brief Provides memory access (pointer stride) information for
+  /// evaluating the cost of an address computation.
+  struct AddressAccessInfo {
+    /// True if the access is strided (an add recurrence), false otherwise.
+    bool isStrided;
+    /// The step of the stride recurrence, or null if unknown.
+    const SCEV *Step;
+
+    AddressAccessInfo() : isStrided(false), Step(nullptr) {}
+
+    bool isStridedAccess() const { return isStrided; }
+
+    bool isConstantStridedAccess() const {
+      return isStrided && isa<SCEVConstant>(Step);
+    }
+
+    /// Returns true if the access has a constant stride known to be smaller
+    /// than MergeDistance.
+    bool isConstantStridedAccessLessThan(int64_t MergeDistance) const {
+      if (!isConstantStridedAccess())
+        return false;
+      APInt StrideVal = cast<SCEVConstant>(Step)->getAPInt();
+      if (StrideVal.getBitWidth() > 64)
+        return false;
+      return StrideVal.getSExtValue() < MergeDistance;
+    }
+  };
+
   /// \returns The cost of the address computation. For most targets this can
   /// be merged into the instruction indexing mode. Some targets might want to
   /// distinguish between address computation for memory operations on vector
   /// types and scalar types. Such targets should override this function.
-  /// The 'IsComplex' parameter is a hint that the address computation is
-  /// likely to involve multiple instructions and as such unlikely to be
-  /// merged into the address indexing mode.
-  int getAddressComputationCost(Type *Ty, bool IsComplex = false) const;
+  /// The 'AddressInfo' parameter describes the address access, so the target
+  /// can decide whether it requires extra computation or is likely to be
+  /// merged into the addressing indexing modes.
+  int getAddressComputationCost(
+      Type *Ty, AddressAccessInfo *AddressInfo = nullptr) const;
 
   /// \returns The cost, if any, of keeping values of the given types alive
   /// over a callsite.
@@ -787,7 +811,8 @@
   virtual int getCallInstrCost(Function *F, Type *RetTy,
                                ArrayRef<Type *> Tys) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
-  virtual int getAddressComputationCost(Type *Ty, bool IsComplex) = 0;
+  virtual int getAddressComputationCost(Type *Ty,
+                                        AddressAccessInfo *AddressInfo) = 0;
   virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0;
   virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                                   MemIntrinsicInfo &Info) = 0;
@@ -1036,8 +1061,9 @@
   unsigned getNumberOfParts(Type *Tp) override {
     return Impl.getNumberOfParts(Tp);
   }
-  int getAddressComputationCost(Type *Ty, bool IsComplex) override {
-    return Impl.getAddressComputationCost(Ty, IsComplex);
+  int getAddressComputationCost(Type *Ty,
+                                AddressAccessInfo *AddressInfo) override {
+    return Impl.getAddressComputationCost(Ty, AddressInfo);
   }
   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) override {
     return Impl.getCostOfKeepingLiveOverCall(Tys);
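Aside (illustrative sketch, not part of the patch): a target override consuming
the new hint could look like the following, where TargetFooTTIImpl and its cost
constants are hypothetical:

    // Sketch only; TargetFooTTIImpl and the cost values are made up.
    int TargetFooTTIImpl::getAddressComputationCost(
        Type *Ty, TTI::AddressAccessInfo *AddressInfo) {
      // Without access info, conservatively assume the address folds away.
      if (!Ty->isVectorTy() || !AddressInfo)
        return 0;
      // A small constant stride is expected to fold into the indexing mode.
      if (AddressInfo->isConstantStridedAccessLessThan(65))
        return 0;
      // Otherwise charge for the extra address arithmetic.
      return 10;
    }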
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -370,7 +370,9 @@
   unsigned getNumberOfParts(Type *Tp) { return 0; }
 
-  unsigned getAddressComputationCost(Type *Tp, bool) { return 0; }
+  unsigned getAddressComputationCost(Type *Tp, TTI::AddressAccessInfo *) {
+    return 0;
+  }
 
   unsigned getReductionCost(unsigned, Type *, bool) { return 1; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -923,7 +923,9 @@
     return LT.first;
   }
 
-  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) { return 0; }
+  unsigned getAddressComputationCost(Type *Ty, TTI::AddressAccessInfo *) {
+    return 0;
+  }
 
   unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) {
     assert(Ty->isVectorTy() && "Expect a vector type");
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -389,8 +389,8 @@
 }
 
 int TargetTransformInfo::getAddressComputationCost(Type *Tp,
-                                                   bool IsComplex) const {
-  int Cost = TTIImpl->getAddressComputationCost(Tp, IsComplex);
+                                      AddressAccessInfo *AddressInfo) const {
+  int Cost = TTIImpl->getAddressComputationCost(Tp, AddressInfo);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -104,7 +104,7 @@
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
 
-  int getAddressComputationCost(Type *Ty, bool IsComplex);
+  int getAddressComputationCost(Type *Ty, TTI::AddressAccessInfo *AddressInfo);
 
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -417,14 +417,18 @@
   }
 }
 
-int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int AArch64TTIImpl::getAddressComputationCost(Type *Ty,
+                                          TTI::AddressAccessInfo *AddressInfo) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
   unsigned NumVectorInstToHideOverhead = 10;
+  int MaxMergeDistance = 64;
 
-  if (Ty->isVectorTy() && IsComplex)
+  if (Ty->isVectorTy() && AddressInfo &&
+      !AddressInfo->isConstantStridedAccessLessThan(MaxMergeDistance + 1))
     return NumVectorInstToHideOverhead;
 
   // In many cases the address computation is not merged into the instruction
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -104,7 +104,7 @@
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
 
-  int getAddressComputationCost(Type *Val, bool IsComplex);
+  int getAddressComputationCost(Type *Val, TTI::AddressAccessInfo *AddressInfo);
 
   int getFPOpCost(Type *Ty);
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -338,14 +338,18 @@
   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
 
-int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int ARMTTIImpl::getAddressComputationCost(Type *Ty,
+                                          TTI::AddressAccessInfo *AddressInfo) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
   unsigned NumVectorInstToHideOverhead = 10;
+  int MaxMergeDistance = 64;
 
-  if (Ty->isVectorTy() && IsComplex)
+  if (Ty->isVectorTy() && AddressInfo &&
+      !AddressInfo->isConstantStridedAccessLessThan(MaxMergeDistance + 1))
     return NumVectorInstToHideOverhead;
 
   // In many cases the address computation is not merged into the instruction
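Aside (illustrative sketch, not part of the patch): both backends pass
MaxMergeDistance + 1 because the predicate uses a strict less-than, so a stride
of exactly 64 bytes still counts as mergeable. A standalone program showing
that boundary, with plain integers standing in for the SCEV step:

    #include <cassert>
    #include <cstdint>

    // Mirrors isConstantStridedAccessLessThan for a known constant stride.
    static bool constantStrideLessThan(int64_t StrideVal,
                                       int64_t MergeDistance) {
      return StrideVal < MergeDistance;
    }

    int main() {
      const int64_t MaxMergeDistance = 64;
      // Stride == 64 passes, so no vector overhead is charged...
      assert(constantStrideLessThan(64, MaxMergeDistance + 1));
      // ...while stride == 65 fails and gets NumVectorInstToHideOverhead.
      assert(!constantStrideLessThan(65, MaxMergeDistance + 1));
      return 0;
    }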
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -71,7 +71,8 @@
                       unsigned AddressSpace);
   int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
                              bool VariableMask, unsigned Alignment);
-  int getAddressComputationCost(Type *PtrTy, bool IsComplex);
+  int getAddressComputationCost(Type *PtrTy,
+                                TTI::AddressAccessInfo *AddressInfo);
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                             ArrayRef<Type *> Tys, FastMathFlags FMF);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1563,17 +1563,29 @@
   return Cost+LT.first;
 }
 
-int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int X86TTIImpl::getAddressComputationCost(Type *Ty,
+                                          TTI::AddressAccessInfo *AddressInfo) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
   unsigned NumVectorInstToHideOverhead = 10;
 
-  if (Ty->isVectorTy() && IsComplex)
-    return NumVectorInstToHideOverhead;
+  // The cost of a constant-strided access computation is hidden by X86's
+  // addressing modes regardless of the stride value, so we do not distinguish
+  // between a constant stride in general and one that is less than or equal
+  // to 64. Even for a (loop-invariant) stride whose value is not known at
+  // compile time, the address computation will not incur more than one extra
+  // ADD instruction.
+  if (Ty->isVectorTy() && AddressInfo) {
+    if (!AddressInfo->isStridedAccess())
+      return NumVectorInstToHideOverhead;
+    if (!AddressInfo->isConstantStridedAccess())
+      return 1;
+  }
 
-  return BaseT::getAddressComputationCost(Ty, IsComplex);
+  return BaseT::getAddressComputationCost(Ty, AddressInfo);
 }
 
 int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
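Aside (illustrative sketch, not part of the patch): for vector types the X86
logic above reduces to a three-way decision. A self-contained distillation,
with booleans standing in for Ty->isVectorTy() and the AddressAccessInfo
queries:

    // Hypothetical distillation of X86TTIImpl::getAddressComputationCost.
    int x86AddrCostSketch(bool IsVectorTy, bool HasInfo, bool IsStrided,
                          bool IsConstantStride) {
      const int NumVectorInstToHideOverhead = 10;
      if (IsVectorTy && HasInfo) {
        if (!IsStrided)
          return NumVectorInstToHideOverhead; // scattered addresses
        if (!IsConstantStride)
          return 1; // loop-invariant but unknown stride: one extra ADD
      }
      return 0; // constant stride folds into the addressing mode (base cost)
    }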
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6774,22 +6774,21 @@
   return Cost;
 }
 
-/// \brief Check whether the address computation for a non-consecutive memory
-/// access looks like an unlikely candidate for being merged into the indexing
-/// mode.
-///
-/// We look for a GEP which has one index that is an induction variable and all
-/// other indices are loop invariant. If the stride of this access is also
-/// within a small bound we decide that this address computation can likely be
-/// merged into the addressing mode.
-/// In all other cases, we identify the address computation as complex.
-static bool isLikelyComplexAddressComputation(Value *Ptr,
-                                              LoopVectorizationLegality *Legal,
-                                              ScalarEvolution *SE,
-                                              const Loop *TheLoop) {
+/// \brief Collect information about the complexity of an address access:
+/// whether it is strided, and the stride's step recurrence if it is.
+///
+/// This info can be passed to the target to estimate the cost of the address
+/// computation.
+static TargetTransformInfo::AddressAccessInfo
+getAddressAccessComplexity(Value *Ptr, LoopVectorizationLegality *Legal,
+                           ScalarEvolution *SE, const Loop *TheLoop) {
+  TargetTransformInfo::AddressAccessInfo AddressInfo;
+
   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
   if (!Gep)
-    return true;
+    return AddressInfo;
 
   // We are looking for a gep with all loop invariant indices except for one
   // which should be an induction variable.
@@ -6798,33 +6797,21 @@
     Value *Opd = Gep->getOperand(i);
     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
         !Legal->isInductionVariable(Opd))
-      return true;
+      return AddressInfo;
   }
 
   // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
   // can likely be merged into the address computation.
-  unsigned MaxMergeDistance = 64;
-
   const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
   if (!AddRec)
-    return true;
-
-  // Check the step is constant.
-  const SCEV *Step = AddRec->getStepRecurrence(*SE);
-
-  // Calculate the pointer stride and check if it is consecutive.
-  const auto *C = dyn_cast<SCEVConstant>(Step);
-  if (!C)
-    return true;
-
-  const APInt &APStepVal = C->getAPInt();
-
-  // Huge step value - give up.
-  if (APStepVal.getBitWidth() > 64)
-    return true;
+    return AddressInfo;
 
-  int64_t StepVal = APStepVal.getSExtValue();
+  AddressInfo.isStrided = true;
 
-  return StepVal > MaxMergeDistance;
+  // Record the step recurrence; targets can query whether it is constant.
+  AddressInfo.Step = AddRec->getStepRecurrence(*SE);
+  return AddressInfo;
 }
 
 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
@@ -7052,12 +7039,13 @@
   unsigned Cost = 0;
   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
 
-  // True if the memory instruction's address computation is complex.
-  bool IsComplexComputation =
-      isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
+  // Figure out whether the access is strided, and get the stride value if it
+  // is known at compile time.
+  TargetTransformInfo::AddressAccessInfo AddressInfo =
+      getAddressAccessComplexity(Ptr, Legal, SE, TheLoop);
 
   // Get the cost of the scalar memory instruction and address computation.
-  Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
+  Cost += VF * TTI.getAddressComputationCost(PtrTy, &AddressInfo);
   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                    Alignment, AS);
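Aside (illustrative sketch, not part of the patch): since the new parameter
defaults to nullptr, existing callers keep the old conservative behavior;
callers with stride information pass the struct by address, as the vectorizer
now does. A hedged sketch, assuming TTI, PtrTy, and VF are in scope as in the
hunk above:

    // Without access information (legacy callers): defaulted nullptr.
    unsigned CostNoInfo = TTI.getAddressComputationCost(PtrTy);

    // With access information, as in the vectorizer above; a
    // default-constructed info reports a non-strided access.
    TargetTransformInfo::AddressAccessInfo AddressInfo;
    unsigned CostWithInfo =
        VF * TTI.getAddressComputationCost(PtrTy, &AddressInfo);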