Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -136,7 +136,8 @@
   /// The contract for this function is the same as \c getOperationCost except
   /// that it supports an interface that provides extra information specific to
   /// the GEP operation.
-  unsigned getGEPCost(const Value *Ptr, ArrayRef<const Value *> Operands) const;
+  unsigned getGEPCost(Type *PointeeType, const Value *Ptr,
+                      ArrayRef<const Value *> Operands) const;
 
   /// \brief Estimate the cost of a function call when lowered.
   ///
@@ -522,7 +523,7 @@
   virtual ~Concept() = 0;
   virtual unsigned getOperationCost(unsigned Opcode, Type *Ty,
                                     Type *OpTy) = 0;
-  virtual unsigned getGEPCost(const Value *Ptr,
+  virtual unsigned getGEPCost(Type *PointeeType, const Value *Ptr,
                               ArrayRef<const Value *> Operands) = 0;
   virtual unsigned getCallCost(FunctionType *FTy, int NumArgs) = 0;
   virtual unsigned getCallCost(const Function *F, int NumArgs) = 0;
@@ -610,9 +611,9 @@
   unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) override {
     return Impl.getOperationCost(Opcode, Ty, OpTy);
   }
-  unsigned getGEPCost(const Value *Ptr,
+  unsigned getGEPCost(Type *PointeeType, const Value *Ptr,
                       ArrayRef<const Value *> Operands) override {
-    return Impl.getGEPCost(Ptr, Operands);
+    return Impl.getGEPCost(PointeeType, Ptr, Operands);
   }
   unsigned getCallCost(FunctionType *FTy, int NumArgs) override {
     return Impl.getCallCost(FTy, NumArgs);
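
Note on the interface change above: callers now name the GEP's source element type explicitly instead of digging it out of the pointer operand's pointer type. A minimal sketch of what a call site looks like after this patch; the helper is hypothetical and simply mirrors the updated getUserCost() call site further down, it is not part of the change:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Operator.h"
  using namespace llvm;

  // Hypothetical helper, for illustration only.
  static unsigned gepCostAfterThisPatch(const TargetTransformInfo &TTI,
                                        const GEPOperator *GEP) {
    SmallVector<const Value *, 4> Indices(GEP->idx_begin(), GEP->idx_end());
    // The pointee type travels as an explicit argument, so the query keeps
    // working once pointer types no longer carry an element type.
    return TTI.getGEPCost(GEP->getSourceElementType(),
                          GEP->getPointerOperand(), Indices);
  }
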
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -19,6 +19,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/Type.h"
 
@@ -107,7 +108,8 @@
     }
   }
 
-  unsigned getGEPCost(const Value *Ptr, ArrayRef<const Value *> Operands) {
+  unsigned getGEPCost(Type *PointeeType, const Value *Ptr,
+                      ArrayRef<const Value *> Operands) {
     // In the basic model, we just assume that all-constant GEPs will be folded
     // into their uses via addressing modes.
     for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx)
@@ -386,6 +388,61 @@
     return static_cast<T *>(this)->getCallCost(F, Arguments.size());
   }
 
+  using BaseT::getGEPCost;
+
+  unsigned getGEPCost(Type *PointeeType, const Value *Ptr,
+                      ArrayRef<const Value *> Operands0) {
+    const GlobalValue *BaseGV = nullptr;
+    if (Ptr != nullptr) {
+      // TODO: will remove this when a pointer can have an opaque type.
+      assert(Ptr->getType()->getScalarType()->getPointerElementType() ==
+                 PointeeType &&
+             "explicit pointee type doesn't match operand's pointee type");
+      BaseGV = dyn_cast<GlobalValue>(Ptr->stripPointerCasts());
+    }
+    bool HasBaseReg = (BaseGV == nullptr);
+    int64_t BaseOffset = 0;
+    int64_t Scale = 0;
+
+    // Cast the elements in Operands0 to a non-const type.
+    SmallVector<Value *, 4> Operands;
+    for (const Value *Operand : Operands0)
+      Operands.push_back(const_cast<Value *>(Operand));
+
+    // The address space of the starting pointer type is unimportant; its
+    // address space won't be used.
+    generic_gep_type_iterator<Value *const *> GTI = gep_type_begin(
+        PointerType::getUnqual(PointeeType), ArrayRef<Value *>(Operands));
+    for (auto I = Operands.begin(); I != Operands.end(); ++I, ++GTI) {
+      if (isa<SequentialType>(*GTI)) {
+        int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+        if (const ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I)) {
+          BaseOffset += ConstIdx->getSExtValue() * ElementSize;
+        } else {
+          // Needs scale register.
+          if (Scale != 0) {
+            // No addressing mode takes two scale registers.
+            return TTI::TCC_Basic;
+          }
+          Scale = ElementSize;
+        }
+      } else {
+        StructType *STy = cast<StructType>(*GTI);
+        uint64_t Field = cast<ConstantInt>(*I)->getZExtValue();
+        BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field);
+      }
+    }
+
+    Type *GEPReturnTy =
+        GetElementPtrInst::getGEPReturnType(const_cast<Value *>(Ptr), Operands);
+    if (static_cast<T *>(this)->isLegalAddressingMode(
+            GEPReturnTy, const_cast<GlobalValue *>(BaseGV), BaseOffset,
+            HasBaseReg, Scale)) {
+      return TTI::TCC_Free;
+    }
+    return TTI::TCC_Basic;
+  }
+
   using BaseT::getIntrinsicCost;
 
   unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
@@ -405,9 +462,9 @@
       return TTI::TCC_Free; // Model all PHI nodes as free.
 
     if (const GEPOperator *GEP = dyn_cast<GEPOperator>(U)) {
-      SmallVector<const Value *, 4> Indices(GEP->idx_begin(), GEP->idx_end());
-      return static_cast<T *>(this)
-          ->getGEPCost(GEP->getPointerOperand(), Indices);
+      SmallVector<const Value *, 4> Indices(GEP->idx_begin(), GEP->idx_end());
+      return static_cast<T *>(this)->getGEPCost(
+          GEP->getSourceElementType(), GEP->getPointerOperand(), Indices);
     }
 
     if (auto CS = ImmutableCallSite(U)) {
Index: lib/Analysis/CostModel.cpp
===================================================================
--- lib/Analysis/CostModel.cpp
+++ lib/Analysis/CostModel.cpp
@@ -383,10 +383,8 @@
     return -1;
 
   switch (I->getOpcode()) {
-  case Instruction::GetElementPtr:{
-    Type *ValTy = I->getOperand(0)->getType()->getPointerElementType();
-    return TTI->getAddressComputationCost(ValTy);
-  }
+  case Instruction::GetElementPtr:
+    return TTI->getUserCost(I);
 
   case Instruction::Ret:
   case Instruction::PHI:
Index: lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -232,6 +232,7 @@
           Basis.CandidateKind == C.CandidateKind);
 }
 
+// TODO: use TTI->getGEPCost.
 static bool isGEPFoldable(GetElementPtrInst *GEP,
                           const TargetTransformInfo *TTI,
                           const DataLayout *DL) {
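
Note on the default implementation above: it flattens the GEP into the (BaseGV, BaseOffset, Scale) triple that addressing modes understand. Constant sequential indices and struct field indices accumulate into BaseOffset, the single variable sequential index (if any) claims the scale register with its element size, and a second variable index gives up immediately with TCC_Basic; otherwise the GEP is free exactly when the target's isLegalAddressingMode accepts the triple. The standalone snippet below only illustrates that bookkeeping for a made-up GEP, getelementptr %struct.S, %struct.S* %p, i32 %i, i32 1 with %struct.S = type { i32, float } (8 bytes, field 1 at byte offset 4); it is not code from the patch:

  #include <cstdint>
  #include <cstdio>

  int main() {
    int64_t BaseOffset = 0; // accumulated constant byte offset
    int64_t Scale = 0;      // element size attached to the one variable index

    Scale = 8;       // the variable index %i strides over whole %struct.S objects
    BaseOffset += 4; // the constant field index 1 adds the struct layout offset

    // getGEPCost() would now ask isLegalAddressingMode(float*, /*BaseGV=*/nullptr,
    // BaseOffset, /*HasBaseReg=*/true, Scale) and return TCC_Free on success,
    // TCC_Basic otherwise.
    std::printf("BaseOffset = %lld, Scale = %lld\n", (long long)BaseOffset,
                (long long)Scale);
    return 0;
  }
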
Index: test/Analysis/CostModel/ARM/gep.ll
===================================================================
--- test/Analysis/CostModel/ARM/gep.ll
+++ test/Analysis/CostModel/ARM/gep.ll
@@ -3,41 +3,85 @@
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios6.0.0"
 
-define void @test_geps() {
-  ; Cost of scalar integer geps should be one. We can't always expect it to be
-  ; folded into the instruction addressing mode.
-;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8*
+define void @test_geps(i32 %i) {
+  ; GEPs with index 0 are essentially NOOPs.
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8*
   %a0 = getelementptr inbounds i8, i8* undef, i32 0
-;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16*
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16*
   %a1 = getelementptr inbounds i16, i16* undef, i32 0
-;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32*
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32*
   %a2 = getelementptr inbounds i32, i32* undef, i32 0
-
-;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64*
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64*
   %a3 = getelementptr inbounds i64, i64* undef, i32 0
-
-  ; Cost of scalar floating point geps should be one. We cannot fold the address
-  ; computation.
-;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds float, float*
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds float, float*
   %a4 = getelementptr inbounds float, float* undef, i32 0
-;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds double, double*
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds double, double*
   %a5 = getelementptr inbounds double, double* undef, i32 0
-
-
-  ; Cost of vector geps should be one. We cannot fold the address computation.
-;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i8>, <4 x i8>*
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i8>, <4 x i8>*
   %a7 = getelementptr inbounds <4 x i8>, <4 x i8>* undef, i32 0
-;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i16>, <4 x i16>*
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i16>, <4 x i16>*
   %a8 = getelementptr inbounds <4 x i16>, <4 x i16>* undef, i32 0
-;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i32>, <4 x i32>*
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i32>, <4 x i32>*
   %a9 = getelementptr inbounds <4 x i32>, <4 x i32>* undef, i32 0
-;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i64>, <4 x i64>*
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i64>, <4 x i64>*
   %a10 = getelementptr inbounds <4 x i64>, <4 x i64>* undef, i32 0
-;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x float>, <4 x float>*
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x float>, <4 x float>*
   %a11 = getelementptr inbounds <4 x float>, <4 x float>* undef, i32 0
-;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x double>, <4 x double>*
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x double>, <4 x double>*
   %a12 = getelementptr inbounds <4 x double>, <4 x double>* undef, i32 0
 
+  ; Cost of GEPs is one if we cannot fold the address computation.
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8*
+  %b0 = getelementptr inbounds i8, i8* undef, i32 1024
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16*
+  %b1 = getelementptr inbounds i16, i16* undef, i32 1024
+  ; Thumb-2 cannot fold offset >= 2^12 into address computation.
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32*
+  %b2 = getelementptr inbounds i32, i32* undef, i32 1024
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64*
+  %b3 = getelementptr inbounds i64, i64* undef, i32 1024
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds float, float*
+  %b4 = getelementptr inbounds float, float* undef, i32 1024
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds double, double*
+  %b5 = getelementptr inbounds double, double* undef, i32 1024
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i8>, <4 x i8>*
+  %b7 = getelementptr inbounds <4 x i8>, <4 x i8>* undef, i32 1
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i16>, <4 x i16>*
+  %b8 = getelementptr inbounds <4 x i16>, <4 x i16>* undef, i32 1
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i32>, <4 x i32>*
+  %b9 = getelementptr inbounds <4 x i32>, <4 x i32>* undef, i32 1
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i64>, <4 x i64>*
+  %b10 = getelementptr inbounds <4 x i64>, <4 x i64>* undef, i32 1
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x float>, <4 x float>*
+  %b11 = getelementptr inbounds <4 x float>, <4 x float>* undef, i32 1
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x double>, <4 x double>*
+  %b12 = getelementptr inbounds <4 x double>, <4 x double>* undef, i32 1
+
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8*
+  %c0 = getelementptr inbounds i8, i8* undef, i32 %i
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16*
+  %c1 = getelementptr inbounds i16, i16* undef, i32 %i
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32*
+  %c2 = getelementptr inbounds i32, i32* undef, i32 %i
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64*
+  %c3 = getelementptr inbounds i64, i64* undef, i32 %i
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds float, float*
+  %c4 = getelementptr inbounds float, float* undef, i32 %i
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds double, double*
+  %c5 = getelementptr inbounds double, double* undef, i32 %i
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i8>, <4 x i8>*
+  %c7 = getelementptr inbounds <4 x i8>, <4 x i8>* undef, i32 %i
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i16>, <4 x i16>*
+  %c8 = getelementptr inbounds <4 x i16>, <4 x i16>* undef, i32 %i
+  ; Thumb-2 cannot fold scales larger than 8 to address computation.
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i32>, <4 x i32>*
+  %c9 = getelementptr inbounds <4 x i32>, <4 x i32>* undef, i32 %i
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i64>, <4 x i64>*
+  %c10 = getelementptr inbounds <4 x i64>, <4 x i64>* undef, i32 %i
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x float>, <4 x float>*
+  %c11 = getelementptr inbounds <4 x float>, <4 x float>* undef, i32 %i
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x double>, <4 x double>*
+  %c12 = getelementptr inbounds <4 x double>, <4 x double>* undef, i32 %i
+
   ret void
 }
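
The expectations in the b- and c-series above follow from that decomposition plus the Thumb-2 limits stated in the test's own comments. A constant index of 1024 becomes a byte offset of 1024 * sizeof(element): 1024 for i8 and 2048 for i16 stay below 2^12 and fold, while 4096 for i32 and float and 8192 for i64 and double do not. A variable index becomes a scale equal to sizeof(element): the scalar elements (1 to 8 bytes) and the 4- and 8-byte vectors fold, while the 16- and 32-byte vectors need a scale larger than 8 and therefore cost 1.
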
Index: test/Analysis/CostModel/no_info.ll
===================================================================
--- test/Analysis/CostModel/no_info.ll
+++ test/Analysis/CostModel/no_info.ll
@@ -5,9 +5,27 @@
 
 ; -- No triple in this module --
 
-;CHECK: cost of 1 {{.*}} add
-;CHECK: cost of 1 {{.*}} ret
+; CHECK-LABEL: function 'no_info'
+; CHECK: cost of 1 {{.*}} add
+; CHECK: cost of 1 {{.*}} ret
 define i32 @no_info(i32 %arg) {
   %e = add i32 %arg, %arg
   ret i32 %e
 }
+
+define i8 @addressing_mode_reg_reg(i8* %a, i32 %b) {
+; CHECK-LABEL: function 'addressing_mode_reg_reg'
+  %p = getelementptr i8, i8* %a, i32 %b ; NoTTI accepts reg+reg addressing.
+; CHECK: cost of 0 {{.*}} getelementptr
+  %v = load i8, i8* %p
+  ret i8 %v
+}
+
+; CHECK-LABEL: function 'addressing_mode_scaled_reg'
+define i32 @addressing_mode_scaled_reg(i32* %a, i32 %b) {
+  ; NoTTI rejects reg+scale*reg addressing.
+  %p = getelementptr i32, i32* %a, i32 %b
+; CHECK: cost of 1 {{.*}} getelementptr
+  %v = load i32, i32* %p
+  ret i32 %v
+}
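
The no_info.ll checks exercise the path with no target information. They are consistent with the default addressing-mode heuristic assumed here, namely that without a target only reg and reg+reg addressing is legal: no global base, no folded constant offset, and a scale of 0 or 1. The snippet below is a standalone restatement of that assumption (not the in-tree implementation) applied to the two GEPs from the test:

  #include <cstdint>
  #include <cstdio>

  // Standalone restatement of the assumed "no target info" rule: only reg and
  // reg+reg addressing is treated as legal, i.e. no global base, no folded
  // constant offset, and a scale of 0 or 1. Not the in-tree code.
  static bool defaultIsLegalAddressingMode(bool HasBaseGV, int64_t BaseOffset,
                                           int64_t Scale) {
    return !HasBaseGV && BaseOffset == 0 && (Scale == 0 || Scale == 1);
  }

  int main() {
    // getelementptr i8,  i8*  %a, i32 %b -> variable index, element size 1.
    std::printf("i8 GEP foldable:  %d\n", defaultIsLegalAddressingMode(false, 0, 1));
    // getelementptr i32, i32* %a, i32 %b -> variable index, element size 4.
    std::printf("i32 GEP foldable: %d\n", defaultIsLegalAddressingMode(false, 0, 4));
    return 0;
  }
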
Index: test/Transforms/LoopUnroll/X86/partial.ll
===================================================================
--- test/Transforms/LoopUnroll/X86/partial.ll
+++ test/Transforms/LoopUnroll/X86/partial.ll
@@ -86,17 +86,20 @@
   %reduction.026 = phi i16 [ %add14, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i16, i16* %arr, i64 %indvars.iv
   %0 = load i16, i16* %arrayidx, align 2
-  %add = add i16 %0, %reduction.026
+  %mul = shl i16 %0, 1
+  %add = add i16 %mul, %reduction.026
   %sext = mul i64 %indvars.iv, 12884901888
   %idxprom3 = ashr exact i64 %sext, 32
   %arrayidx4 = getelementptr inbounds i16, i16* %arr, i64 %idxprom3
   %1 = load i16, i16* %arrayidx4, align 2
-  %add7 = add i16 %add, %1
+  %mul2 = shl i16 %1, 1
+  %add7 = add i16 %add, %mul2
   %sext28 = mul i64 %indvars.iv, 21474836480
   %idxprom10 = ashr exact i64 %sext28, 32
   %arrayidx11 = getelementptr inbounds i16, i16* %arr, i64 %idxprom10
   %2 = load i16, i16* %arrayidx11, align 2
-  %add14 = add i16 %add7, %2
+  %mul3 = shl i16 %2, 1
+  %add14 = add i16 %add7, %mul3
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %n
Index: test/Transforms/LoopVectorize/X86/metadata-enable.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -60,7 +60,7 @@
   %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
   store i32 %add, i32* %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 32
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
 
 for.end:                                          ; preds = %for.body
@@ -111,7 +111,7 @@
   %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
   store i32 %add, i32* %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 32
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body
@@ -162,7 +162,7 @@
   %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
   store i32 %add, i32* %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 32
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2
 
 for.end:                                          ; preds = %for.body
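
The two loop tests above do not exercise the new API directly; presumably the extra shl in partial.ll and the doubled trip counts in metadata-enable.ll compensate for in-loop GEPs now being modeled as free, so the unrolling and vectorization decisions under test stay the same. The cost-model entry point itself changes in the CostModel.cpp hunk: a GetElementPtr is priced through getUserCost(), which reaches the new getGEPCost() hook, rather than through a flat getAddressComputationCost(). A minimal sketch of that dispatch (the helper is hypothetical; the getUserCost() call is the one the patch introduces):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Hypothetical helper mirroring the CostModel.cpp hunk: GEPs are priced via
  // getUserCost(), which dispatches to the new getGEPCost() with the GEP's
  // source element type, instead of via getAddressComputationCost().
  static unsigned costOfGEPInstruction(const TargetTransformInfo &TTI,
                                       const Instruction *I) {
    if (I->getOpcode() == Instruction::GetElementPtr)
      return TTI.getUserCost(I);
    return 1; // Other opcodes are handled elsewhere in getInstructionCost().
  }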