Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -36,6 +36,8 @@
 class Function;
 class GlobalValue;
 class Loop;
+class ScalarEvolution;
+class SCEV;
 class Type;
 class User;
 class Value;
@@ -605,10 +607,11 @@
   /// merged into the instruction indexing mode. Some targets might want to
   /// distinguish between address computation for memory operations on vector
   /// types and scalar types. Such targets should override this function.
-  /// The 'IsComplex' parameter is a hint that the address computation is likely
-  /// to involve multiple instructions and as such unlikely to be merged into
-  /// the address indexing mode.
-  int getAddressComputationCost(Type *Ty, bool IsComplex = false) const;
+  /// The 'SE' parameter holds a pointer to the ScalarEvolution object, which
+  /// is used to obtain the step value of 'Ptr' when the stride is constant.
+  /// The 'Ptr' parameter holds the SCEV of the access pointer.
+  int getAddressComputationCost(Type *Ty, ScalarEvolution *SE = nullptr,
+                                const SCEV *Ptr = nullptr) const;
 
   /// \returns The cost, if any, of keeping values of the given types alive
   /// over a callsite.
@@ -787,7 +790,8 @@
   virtual int getCallInstrCost(Function *F, Type *RetTy,
                                ArrayRef<Type *> Tys) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
-  virtual int getAddressComputationCost(Type *Ty, bool IsComplex) = 0;
+  virtual int getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
+                                        const SCEV *Ptr) = 0;
   virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0;
   virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                                   MemIntrinsicInfo &Info) = 0;
@@ -1036,8 +1040,9 @@
   unsigned getNumberOfParts(Type *Tp) override {
     return Impl.getNumberOfParts(Tp);
   }
-  int getAddressComputationCost(Type *Ty, bool IsComplex) override {
-    return Impl.getAddressComputationCost(Ty, IsComplex);
+  int getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
+                                const SCEV *Ptr) override {
+    return Impl.getAddressComputationCost(Ty, SE, Ptr);
   }
   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) override {
     return Impl.getCostOfKeepingLiveOverCall(Tys);
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFOIMPL_H
 #define LLVM_ANALYSIS_TARGETTRANSFORMINFOIMPL_H
 
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
@@ -370,7 +371,10 @@
 
   unsigned getNumberOfParts(Type *Tp) { return 0; }
 
-  unsigned getAddressComputationCost(Type *Tp, bool) { return 0; }
+  unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *,
+                                     const SCEV *) {
+    return 0;
+  }
 
   unsigned getReductionCost(unsigned, Type *, bool) { return 1; }
 
@@ -422,6 +426,29 @@
                                            VectorType *VecTy) const {
     return VF;
   }
+protected:
+  bool isStridedAccess(const SCEV *Ptr) {
+    return Ptr && isa<SCEVAddRecExpr>(Ptr);
+  }
+
+  const SCEVConstant *getConstantStrideStep(ScalarEvolution *SE,
+                                            const SCEV *Ptr) {
+    if (!isStridedAccess(Ptr))
+      return nullptr;
+    const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ptr);
+    return dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(*SE));
+  }
+
+  bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr,
+                                       int64_t MergeDistance) {
+    const SCEVConstant *Step = getConstantStrideStep(SE, Ptr);
+    if (!Step)
+      return false;
+    APInt StrideVal = Step->getAPInt();
+    if (StrideVal.getBitWidth() > 64)
+      return false;
+    return StrideVal.getSExtValue() < MergeDistance;
+  }
 };
 
 /// \brief CRTP base class for use as a mix-in that aids implementing
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -923,7 +923,10 @@
     return LT.first;
   }
 
-  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) { return 0; }
+  unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *,
+                                     const SCEV *) {
+    return 0;
+  }
 
   unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) {
     assert(Ty->isVectorTy() && "Expect a vector type");
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -389,8 +389,9 @@
 }
 
 int TargetTransformInfo::getAddressComputationCost(Type *Tp,
-                                                   bool IsComplex) const {
-  int Cost = TTIImpl->getAddressComputationCost(Tp, IsComplex);
+                                                   ScalarEvolution *SE,
+                                                   const SCEV *Ptr) const {
+  int Cost = TTIImpl->getAddressComputationCost(Tp, SE, Ptr);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -104,7 +104,7 @@
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
 
-  int getAddressComputationCost(Type *Ty, bool IsComplex);
+  int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
 
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -417,14 +417,17 @@
   }
 }
 
-int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
+                                              const SCEV *Ptr) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
   unsigned NumVectorInstToHideOverhead = 10;
+  int MaxMergeDistance = 64;
 
-  if (Ty->isVectorTy() && IsComplex)
+  if (Ty->isVectorTy() && SE &&
+      !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
     return NumVectorInstToHideOverhead;
 
   // In many cases the address computation is not merged into the instruction
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -104,7 +104,8 @@
 
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
 
-  int getAddressComputationCost(Type *Val, bool IsComplex);
+  int getAddressComputationCost(Type *Val, ScalarEvolution *SE,
+                                const SCEV *Ptr);
 
   int getFPOpCost(Type *Ty);
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -338,14 +338,17 @@
   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
 
-int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
+                                          const SCEV *Ptr) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
   unsigned NumVectorInstToHideOverhead = 10;
+  int MaxMergeDistance = 64;
 
-  if (Ty->isVectorTy() && IsComplex)
+  if (Ty->isVectorTy() && SE &&
+      !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
     return NumVectorInstToHideOverhead;
 
   // In many cases the address computation is not merged into the instruction
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -71,7 +71,8 @@
                           unsigned AddressSpace);
   int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
                              bool VariableMask, unsigned Alignment);
-  int getAddressComputationCost(Type *PtrTy, bool IsComplex);
+  int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+                                const SCEV *Ptr);
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                             ArrayRef<Type *> Tys, FastMathFlags FMF);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1563,17 +1563,29 @@
   return Cost+LT.first;
 }
 
-int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
+                                          const SCEV *Ptr) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
   unsigned NumVectorInstToHideOverhead = 10;
 
-  if (Ty->isVectorTy() && IsComplex)
-    return NumVectorInstToHideOverhead;
+  // The cost of the address computation for a strided access is hidden by
+  // the indexing modes of X86, regardless of the stride value. We do not
+  // believe there is a difference between a constant strided access in
+  // general and a constant strided access whose stride value is less than
+  // or equal to 64.
+  // Even when the (loop-invariant) stride is not known at compile time, the
+  // address computation will not incur more than one extra ADD instruction.
+  if (Ty->isVectorTy() && SE) {
+    if (!BaseT::isStridedAccess(Ptr))
+      return NumVectorInstToHideOverhead;
+    if (!BaseT::getConstantStrideStep(SE, Ptr))
+      return 1;
+  }
 
-  return BaseT::getAddressComputationCost(Ty, IsComplex);
+  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
 }
 
 int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6785,22 +6785,19 @@
   return Cost;
 }
 
-/// \brief Check whether the address computation for a non-consecutive memory
-/// access looks like an unlikely candidate for being merged into the indexing
-/// mode.
+/// \brief Gets the address access SCEV after verifying that the access
+/// pattern is loop invariant except for the induction variable dependence.
 ///
-/// We look for a GEP which has one index that is an induction variable and all
-/// other indices are loop invariant. If the stride of this access is also
-/// within a small bound we decide that this address computation can likely be
-/// merged into the addressing mode.
-/// In all other cases, we identify the address computation as complex.
-static bool isLikelyComplexAddressComputation(Value *Ptr,
-                                              LoopVectorizationLegality *Legal,
-                                              ScalarEvolution *SE,
-                                              const Loop *TheLoop) {
+/// This SCEV can be sent to the target in order to estimate the address
+/// calculation cost.
+static const SCEV *getAddressAccessSCEV(
+    Value *Ptr,
+    LoopVectorizationLegality *Legal,
+    ScalarEvolution *SE,
+    const Loop *TheLoop) {
   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
   if (!Gep)
-    return true;
+    return nullptr;
 
   // We are looking for a gep with all loop invariant indices except for one
   // which should be an induction variable.
@@ -6809,33 +6806,11 @@
     Value *Opd = Gep->getOperand(i);
     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
         !Legal->isInductionVariable(Opd))
-      return true;
+      return nullptr;
   }
 
-  // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
-  // can likely be merged into the address computation.
-  unsigned MaxMergeDistance = 64;
-
-  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
-  if (!AddRec)
-    return true;
-
-  // Check the step is constant.
-  const SCEV *Step = AddRec->getStepRecurrence(*SE);
-  // Calculate the pointer stride and check if it is consecutive.
-  const auto *C = dyn_cast<SCEVConstant>(Step);
-  if (!C)
-    return true;
-
-  const APInt &APStepVal = C->getAPInt();
-
-  // Huge step value - give up.
-  if (APStepVal.getBitWidth() > 64)
-    return true;
-
-  int64_t StepVal = APStepVal.getSExtValue();
-
-  return StepVal > MaxMergeDistance;
+  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the pointer SCEV.
+  return SE->getSCEV(Ptr);
 }
 
 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
@@ -7063,12 +7038,12 @@
   unsigned Cost = 0;
   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
 
-  // True if the memory instruction's address computation is complex.
-  bool IsComplexComputation =
-      isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
+  // Figure out whether the access is strided, and get the stride value
+  // if it is known at compile time.
+  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
 
   // Get the cost of the scalar memory instruction and address computation.
-  Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
+  Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                    Alignment, AS);
Index: test/Transforms/LoopVectorize/X86/strided_load_cost.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -0,0 +1,54 @@
+; This test checks that the given loop is still beneficial to vectorize
+; even if it contains a scalarized load (gather on AVX2).
+; RUN: opt < %s -loop-vectorize -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
+entry:
+  %idxprom = sext i32 %i to i64
+  %idxprom5 = sext i32 %j to i64
+  br label %for.body
+
+  for.cond.cleanup:                                 ; preds = %for.body
+    ret i32 %add7
+
+  for.body:                                         ; preds = %for.body, %entry
+    ; the loop gets vectorized
+    ; first consecutive load as a vector load
+    ; CHECK: %wide.load = load <8 x i32>
+    ; second strided load scalarized
+    ; CHECK: load i32
+    ; CHECK: load i32
+    ; CHECK: load i32
+    ; CHECK: load i32
+    ; CHECK: load i32
+    ; CHECK: load i32
+    ; CHECK: load i32
+    ; CHECK: load i32
+
+    %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+    %sum.015 = phi i32 [ 0, %entry ], [ %add7, %for.body ]
+    %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %idxprom, i64 %indvars.iv
+    %0 = load i32, i32* %arrayidx2, align 4, !tbaa !1
+    %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %indvars.iv, i64 %idxprom5
+    %1 = load i32, i32* %arrayidx6, align 4, !tbaa !1
+    %mul = mul nsw i32 %1, %0
+    %add = add i32 %sum.015, 4
+    %add7 = add i32 %add, %mul
+    %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+    %exitcond = icmp eq i64 %indvars.iv.next, 100
+    br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 4.0.0 (cfe/trunk 284570)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
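For reference, a minimal sketch (not part of the patch) of how a client of the reworked hook can be driven, mirroring what the LoopVectorize change above does: the pointer's SCEV is handed to the target so it can inspect the access stride. The helper name addressCostForLoad is invented for illustration; the ScalarEvolution and TargetTransformInfo calls are the ones introduced or used in this patch.

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Estimate the address-computation cost for a load by passing the SCEV of its
// pointer operand. For a strided access inside a loop this SCEV is a
// SCEVAddRecExpr whose step recurrence is the stride, which the target hooks
// above use to decide whether the address math folds into an indexing mode.
static int addressCostForLoad(LoadInst *LI, ScalarEvolution &SE,
                              const TargetTransformInfo &TTI) {
  const SCEV *PtrSCEV = SE.getSCEV(LI->getPointerOperand());
  return TTI.getAddressComputationCost(LI->getPointerOperandType(), &SE,
                                       PtrSCEV);
}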