Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -601,14 +601,26 @@
   /// split during legalization. Zero is returned when the answer is unknown.
   unsigned getNumberOfParts(Type *Tp) const;
 
+  /// \brief Provides memory access (pointer stride) information for the
+  /// evaluation of the address-computation cost.
+  struct AddressAccessInfo {
+    bool isStrided;        /// True if the access is strided (AddRec),
+                           /// false otherwise.
+    bool isConstantStride; /// True if the stride is known at compile
+                           /// time.
+    APInt StrideVal;       /// Holds the stride value in bytes when
+                           /// isConstantStride is true; undefined otherwise.
+  };
+
   /// \returns The cost of the address computation. For most targets this can be
   /// merged into the instruction indexing mode. Some targets might want to
   /// distinguish between address computation for memory operations on vector
   /// types and scalar types. Such targets should override this function.
-  /// The 'IsComplex' parameter is a hint that the address computation is likely
-  /// to involve multiple instructions and as such unlikely to be merged into
-  /// the address indexing mode.
-  int getAddressComputationCost(Type *Ty, bool IsComplex = false) const;
+  /// The 'Info' parameter holds information about the address access, so the
+  /// target can decide whether it requires extra computation or is likely to
+  /// be merged into the addressing modes.
+  int getAddressComputationCost(Type *Ty,
+                                AddressAccessInfo *Info = nullptr) const;
 
   /// \returns The cost, if any, of keeping values of the given types alive
   /// over a callsite.
@@ -787,7 +799,7 @@
   virtual int getCallInstrCost(Function *F, Type *RetTy,
                                ArrayRef<Type *> Tys) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
-  virtual int getAddressComputationCost(Type *Ty, bool IsComplex) = 0;
+  virtual int getAddressComputationCost(Type *Ty, AddressAccessInfo *Info) = 0;
   virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0;
   virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                                   MemIntrinsicInfo &Info) = 0;
@@ -810,6 +822,12 @@
   virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const = 0;
+protected:
+  /// Determines whether the given address computation will be hidden by the
+  /// addressing modes of the target, or will instead require extra computation.
+  ///
+  /// \p Info holds a pointer to the access information.
+  virtual bool isExpensiveAddressComputation(AddressAccessInfo *Info) = 0;
 };
 
 template <typename T>
@@ -1036,8 +1054,8 @@
   unsigned getNumberOfParts(Type *Tp) override {
     return Impl.getNumberOfParts(Tp);
   }
-  int getAddressComputationCost(Type *Ty, bool IsComplex) override {
-    return Impl.getAddressComputationCost(Ty, IsComplex);
+  int getAddressComputationCost(Type *Ty, AddressAccessInfo *Info) override {
+    return Impl.getAddressComputationCost(Ty, Info);
   }
   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) override {
     return Impl.getCostOfKeepingLiveOverCall(Tys);
@@ -1085,6 +1103,10 @@
                                        VectorType *VecTy) const override {
     return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
   }
+protected:
+  bool isExpensiveAddressComputation(AddressAccessInfo *Info) override {
+    return Impl.isExpensiveAddressComputation(Info);
+  }
 };
 
 template <typename T>
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -368,9 +368,15 @@
     return 1;
   }
 
+  bool isExpensiveAddressComputation(TTI::AddressAccessInfo *Info) {
+    return false;
+  }
+
   unsigned getNumberOfParts(Type *Tp) { return 0; }
 
-  unsigned getAddressComputationCost(Type *Tp, bool) { return 0; }
+  unsigned getAddressComputationCost(Type *Tp, TTI::AddressAccessInfo *) {
+    return 0;
+  }
 
   unsigned getReductionCost(unsigned, Type *, bool) { return 1; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -918,12 +918,31 @@
     return 10;
   }
 
+  bool isExpensiveAddressComputation(TTI::AddressAccessInfo *Info) {
+    int MaxMergeDistance = 64;
+
+    if (!Info)
+      return false;
+
+    if (!Info->isStrided || !Info->isConstantStride)
+      return true;
+
+    // Huge step value - give up.
+    if (Info->StrideVal.getBitWidth() > 64)
+      return true;
+
+    int64_t StepVal = Info->StrideVal.getSExtValue();
+    return (StepVal > MaxMergeDistance);
+  }
+
   unsigned getNumberOfParts(Type *Tp) {
     std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
     return LT.first;
  }
 
-  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) { return 0; }
+  unsigned getAddressComputationCost(Type *Ty, TTI::AddressAccessInfo *) {
+    return 0;
+  }
 
   unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) {
     assert(Ty->isVectorTy() && "Expect a vector type");
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -389,8 +389,8 @@
 }
 
 int TargetTransformInfo::getAddressComputationCost(Type *Tp,
-                                                   bool IsComplex) const {
-  int Cost = TTIImpl->getAddressComputationCost(Tp, IsComplex);
+                                                   AddressAccessInfo *Info) const {
+  int Cost = TTIImpl->getAddressComputationCost(Tp, Info);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -104,7 +104,7 @@
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
 
-  int getAddressComputationCost(Type *Ty, bool IsComplex);
+  int getAddressComputationCost(Type *Ty, TTI::AddressAccessInfo *Info);
 
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -417,14 +417,15 @@
   }
 }
 
-int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int AArch64TTIImpl::getAddressComputationCost(Type *Ty,
+                                              TTI::AddressAccessInfo *Info) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
   unsigned NumVectorInstToHideOverhead = 10;
 
-  if (Ty->isVectorTy() && IsComplex)
+  if (Ty->isVectorTy() && BaseT::isExpensiveAddressComputation(Info))
     return NumVectorInstToHideOverhead;
 
   // In many cases the address computation is not merged into the instruction
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -104,7 +104,7 @@
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
 
-  int getAddressComputationCost(Type *Val, bool IsComplex);
+  int getAddressComputationCost(Type *Val, TTI::AddressAccessInfo *Info);
 
   int getFPOpCost(Type *Ty);
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -338,14 +338,15 @@
   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
 
-int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int ARMTTIImpl::getAddressComputationCost(Type *Ty,
+                                          TTI::AddressAccessInfo *Info) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
   unsigned NumVectorInstToHideOverhead = 10;
 
-  if (Ty->isVectorTy() && IsComplex)
+  if (Ty->isVectorTy() && BaseT::isExpensiveAddressComputation(Info))
     return NumVectorInstToHideOverhead;
 
   // In many cases the address computation is not merged into the instruction
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -71,7 +71,7 @@
                       unsigned AddressSpace);
   int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
                              bool VariableMask, unsigned Alignment);
-  int getAddressComputationCost(Type *PtrTy, bool IsComplex);
+  int getAddressComputationCost(Type *PtrTy, TTI::AddressAccessInfo *Info);
 
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                             ArrayRef<Type *> Tys, FastMathFlags FMF);
@@ -95,6 +95,7 @@
                            const Function *Callee) const;
 
   bool enableInterleavedAccessVectorization();
+  bool isExpensiveAddressComputation(TTI::AddressAccessInfo *Info);
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
                       unsigned Alignment, unsigned AddressSpace);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1473,17 +1473,29 @@
   return Cost+LT.first;
 }
 
-int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+bool X86TTIImpl::isExpensiveAddressComputation(TTI::AddressAccessInfo *Info) {
+  // The cost of a strided address computation is hidden by the indexing
+  // modes of X86, regardless of the stride value. We do not believe that a
+  // strided access in general behaves differently from one with a constant
+  // stride less than or equal to 64, and the same holds when the stride is
+  // not a compile-time constant.
+  if (Info && !Info->isStrided)
+    return true;
+  return false;
+}
+
+int X86TTIImpl::getAddressComputationCost(Type *Ty,
+                                          TTI::AddressAccessInfo *Info) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
   unsigned NumVectorInstToHideOverhead = 10;
 
-  if (Ty->isVectorTy() && IsComplex)
+  if (Ty->isVectorTy() && isExpensiveAddressComputation(Info))
     return NumVectorInstToHideOverhead;
 
-  return BaseT::getAddressComputationCost(Ty, IsComplex);
+  return BaseT::getAddressComputationCost(Ty, Info);
 }
 
 int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6769,23 +6769,22 @@
   return Cost;
 }
 
-/// \brief Check whether the address computation for a non-consecutive memory
-/// access looks like an unlikely candidate for being merged into the indexing
-/// mode.
+/// \brief Collects address access complexity information: whether the access
+/// is strided and, if the stride is a compile-time constant, its value.
 ///
-/// We look for a GEP which has one index that is an induction variable and all
-/// other indices are loop invariant. If the stride of this access is also
-/// within a small bound we decide that this address computation can likely be
-/// merged into the addressing mode.
-/// In all other cases, we identify the address computation as complex.
-static bool isLikelyComplexAddressComputation(Value *Ptr,
-                                              LoopVectorizationLegality *Legal,
-                                              ScalarEvolution *SE,
-                                              const Loop *TheLoop) {
+/// This information can be passed to the target in order to estimate the
+/// address calculation cost.
+static TargetTransformInfo::AddressAccessInfo getAddressAccessComplexity(
+    Value *Ptr,
+    LoopVectorizationLegality *Legal,
+    ScalarEvolution *SE,
+    const Loop *TheLoop) {
+  TargetTransformInfo::AddressAccessInfo Info;
+  Info.isStrided = false;
   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
   if (!Gep)
-    return true;
-
+    return Info;
+
   // We are looking for a gep with all loop invariant indices except for one
   // which should be an induction variable.
   unsigned NumOperands = Gep->getNumOperands();
@@ -6793,35 +6792,31 @@
     Value *Opd = Gep->getOperand(i);
     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
         !Legal->isInductionVariable(Opd))
-      return true;
+      return Info;
   }
 
   // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
   // can likely be merged into the address computation.
-  unsigned MaxMergeDistance = 64;
-
   const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
   if (!AddRec)
-    return true;
+    return Info;
+
+  Info.isStrided = true;
 
   // Check the step is constant.
   const SCEV *Step = AddRec->getStepRecurrence(*SE);
 
   // Calculate the pointer stride and check if it is consecutive.
   const auto *C = dyn_cast<SCEVConstant>(Step);
   if (!C)
-    return true;
-
-  const APInt &APStepVal = C->getAPInt();
+    return Info;
 
-  // Huge step value - give up.
-  if (APStepVal.getBitWidth() > 64)
-    return true;
+  Info.isConstantStride = true;
+  Info.StrideVal = C->getAPInt();
 
-  int64_t StepVal = APStepVal.getSExtValue();
-
-  return StepVal > MaxMergeDistance;
+  return Info;
 }
 
+
 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
   return Legal->hasStride(I->getOperand(0)) ||
          Legal->hasStride(I->getOperand(1));
@@ -7048,12 +7043,12 @@
   unsigned Cost = 0;
   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
 
-  // True if the memory instruction's address computation is complex.
-  bool IsComplexComputation =
-      isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
+  // Get some information about the address access complexity in the loop.
+  TargetTransformInfo::AddressAccessInfo Info =
+      getAddressAccessComplexity(Ptr, Legal, SE, TheLoop);
 
   // Get the cost of the scalar memory instruction and address computation.
-  Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
+  Cost += VF * TTI.getAddressComputationCost(PtrTy, &Info);
   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                    Alignment, AS);
Index: test/Transforms/LoopVectorize/X86/strided_load_cost.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -0,0 +1,54 @@
+; This test checks that the given loop is still beneficial for vectorization
+; even if it contains a scalarized load (gather on AVX2).
+; RUN: opt < %s -loop-vectorize -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
+entry:
+  %idxprom = sext i32 %i to i64
+  %idxprom5 = sext i32 %j to i64
+  br label %for.body
+
+  for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add7
+
+  for.body:                                         ; preds = %for.body, %entry
+  ; the loop gets vectorized
+  ; first consecutive load as vector load
+  ; CHECK: %wide.load = load <8 x i32>
+  ; second strided load scalarized
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.015 = phi i32 [ 0, %entry ], [ %add7, %for.body ]
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %idxprom, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx2, align 4, !tbaa !1
+  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %indvars.iv, i64 %idxprom5
+  %1 = load i32, i32* %arrayidx6, align 4, !tbaa !1
+  %mul = mul nsw i32 %1, %0
+  %add = add i32 %sum.015, 4
+  %add7 = add i32 %add, %mul
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 4.0.0 (cfe/trunk 284570)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}