Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -162,6 +162,13 @@
   /// the EXT operation.
   int getExtCost(const Instruction *I, const Value *Src) const;
 
+  /// \brief Estimate the cost of a store operation when lowered.
+  ///
+  /// \p I is expected to be a store instruction. The contract for this
+  /// function is the same as \c getOperationCost; this is the basic query to
+  /// estimate the cost of a store in terms of \c TargetCostConstants.
+  int getStoreCost(const Instruction *I) const;
+
   /// \brief Estimate the cost of a function call when lowered.
   ///
   /// The contract for this is the same as \c getOperationCost except that it
@@ -865,6 +872,7 @@
   virtual int getGEPCost(Type *PointeeType, const Value *Ptr,
                          ArrayRef<const Value *> Operands) = 0;
   virtual int getExtCost(const Instruction *I, const Value *Src) = 0;
+  virtual int getStoreCost(const Instruction *I) = 0;
   virtual int getCallCost(FunctionType *FTy, int NumArgs) = 0;
   virtual int getCallCost(const Function *F, int NumArgs) = 0;
   virtual int getCallCost(const Function *F,
@@ -1043,6 +1051,9 @@
   int getExtCost(const Instruction *I, const Value *Src) override {
     return Impl.getExtCost(I, Src);
   }
+  int getStoreCost(const Instruction *I) override {
+    return Impl.getStoreCost(I);
+  }
   int getCallCost(FunctionType *FTy, int NumArgs) override {
     return Impl.getCallCost(FTy, NumArgs);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -705,6 +705,14 @@
     return static_cast<T *>(this)->getIntrinsicCost(IID, RetTy, ParamTys);
   }
 
+  /// Default store cost: defer to \c getOperationCost, passing the
+  /// instruction's own type and the type of its first operand.
+  unsigned getStoreCost(const Instruction *I) {
+    return static_cast<T *>(this)
+        ->getOperationCost(Instruction::Store, I->getType(),
+                           I->getOperand(0)->getType());
+  }
+
   unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands) {
     if (isa<PHINode>(U))
       return TTI::TCC_Free; // Model all PHI nodes as free.
@@ -738,6 +746,10 @@
         return static_cast<T *>(this)->getExtCost(CI, Operands.back());
     }
 
+    // Route stores through the dedicated getStoreCost hook.
+    if (isa<StoreInst>(U))
+      return static_cast<T *>(this)->getStoreCost(cast<StoreInst>(U));
+
     return static_cast<T *>(this)->getOperationCost(
         Operator::getOpcode(U), U->getType(),
         U->getNumOperands() == 1 ? U->getOperand(0)->getType() : nullptr);
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -87,6 +87,10 @@
   return TTIImpl->getExtCost(I, Src);
 }
 
+int TargetTransformInfo::getStoreCost(const Instruction *I) const {
+  return TTIImpl->getStoreCost(I);
+}
+
 int TargetTransformInfo::getIntrinsicCost(
     Intrinsic::ID IID, Type *RetTy, ArrayRef<const Value *> Arguments) const {
   int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -101,6 +101,8 @@
 
   int getIntImmCost(const APInt &Imm, Type *Ty);
 
+  int getStoreCost(const Instruction *I);
+
   int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
   int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                     Type *Ty);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2046,6 +2046,18 @@
   return X86TTIImpl::getIntImmCost(Imm, Ty);
 }
 
+int X86TTIImpl::getStoreCost(const Instruction *I) {
+  assert(isa<StoreInst>(I) && "Expected Store Instruction in getStoreCost()");
+  const Value *Ptr = cast<StoreInst>(I)->getPointerOperand();
+  // A store takes 2 uops. If the address is simple, i.e. needs no scale*index
+  // arithmetic, the address calculation goes to another port, so only an
+  // address computed by a GEP with a non-constant index is charged double.
+  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+    if (!GEP->hasAllConstantIndices())
+      return TTI::TCC_Basic * 2;
+  return TTI::TCC_Basic;
+}
+
 // Return an average cost of Gather / Scatter instruction, maybe improved later
 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
                                 unsigned Alignment, unsigned AddressSpace) {
Index: test/Transforms/LoopUnroll/X86/store_cost.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopUnroll/X86/store_cost.ll
@@ -0,0 +1,108 @@
+; REQUIRES: asserts
+; RUN: opt -mcpu=core-avx2 -loop-unroll --debug-only=loop-unroll -unroll-allow-partial -S %s -o /dev/null 2>&1 | FileCheck %s
+
+; Check that stores are accounted for in the unroller's loop size estimate:
+; the loop with seven stores is expected to be partially unrolled, while the
+; loop with eight stores is expected to be left alone.
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: Loop Unroll: F[foo] Loop %loop.2.header
+; CHECK: Loop Size = 27
+; CHECK-NOT: UNROLLING loop %loop.2.header
+; CHECK: Loop Unroll: F[foo] Loop %loop.header
+; CHECK: Loop Size = 25
+; CHECK: UNROLLING loop %loop.header by 2
+
+define void @foo(i32 * %out) {
+entry:
+  %0 = alloca [1024 x i32]
+  %x0 = alloca [1024 x i32]
+  %x01 = alloca [1024 x i32]
+  %x02 = alloca [1024 x i32]
+  %x03 = alloca [1024 x i32]
+  %x04 = alloca [1024 x i32]
+  %x05 = alloca [1024 x i32]
+  %x06 = alloca [1024 x i32]
+  br label %loop.header
+
+loop.header:
+  %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+  br label %loop.body
+
+loop.body:
+  %ptr = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 %counter
+  store i32 %counter, i32* %ptr
+  %val = add i32 %counter, 5
+  %xptr = getelementptr [1024 x i32], [1024 x i32]* %x0, i32 0, i32 %counter
+  store i32 %val, i32* %xptr
+  %val1 = add i32 %counter, 6
+  %xptr1 = getelementptr [1024 x i32], [1024 x i32]* %x01, i32 0, i32 %counter
+  store i32 %val1, i32* %xptr1
+  %val2 = add i32 %counter, 7
+  %xptr2 = getelementptr [1024 x i32], [1024 x i32]* %x02, i32 0, i32 %counter
+  store i32 %val2, i32* %xptr2
+  %val3 = add i32 %counter, 8
+  %xptr3 = getelementptr [1024 x i32], [1024 x i32]* %x03, i32 0, i32 %counter
+  store i32 %val3, i32* %xptr3
+  %val4 = add i32 %counter, 9
+  %xptr4 = getelementptr [1024 x i32], [1024 x i32]* %x04, i32 0, i32 %counter
+  store i32 %val4, i32* %xptr4
+  %val5 = add i32 %counter, 10
+  %xptr5 = getelementptr [1024 x i32], [1024 x i32]* %x05, i32 0, i32 %counter
+  store i32 %val5, i32* %xptr5
+  br label %loop.inc
+
+loop.inc:
+  %inc = add i32 %counter, 2
+  %1 = icmp sge i32 %inc, 1023
+  br i1 %1, label %exit.0, label %loop.header
+
+exit.0:
+  %2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 5
+  %3 = load i32, i32* %2
+  store i32 %3, i32 * %out
+  br label %loop.2.header
+
+
+loop.2.header:
+  %counter.2 = phi i32 [0, %exit.0], [%inc.2, %loop.2.inc]
+  br label %loop.2.body
+
+loop.2.body:
+  %ptr.2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 %counter.2
+  store i32 %counter.2, i32* %ptr.2
+  %val.2 = add i32 %counter.2, 5
+  %xptr.2 = getelementptr [1024 x i32], [1024 x i32]* %x0, i32 0, i32 %counter.2
+  store i32 %val.2, i32* %xptr.2
+  %val1.2 = add i32 %counter.2, 6
+  %xptr1.2 = getelementptr [1024 x i32], [1024 x i32]* %x01, i32 0, i32 %counter.2
+  store i32 %val1.2, i32* %xptr1.2
+  %val2.2 = add i32 %counter.2, 7
+  %xptr2.2 = getelementptr [1024 x i32], [1024 x i32]* %x02, i32 0, i32 %counter.2
+  store i32 %val2.2, i32* %xptr2.2
+  %val3.2 = add i32 %counter.2, 8
+  %xptr3.2 = getelementptr [1024 x i32], [1024 x i32]* %x03, i32 0, i32 %counter.2
+  store i32 %val3.2, i32* %xptr3.2
+  %val4.2 = add i32 %counter.2, 9
+  %xptr4.2 = getelementptr [1024 x i32], [1024 x i32]* %x04, i32 0, i32 %counter.2
+  store i32 %val4.2, i32* %xptr4.2
+  %val5.2 = add i32 %counter.2, 10
+  %xptr5.2 = getelementptr [1024 x i32], [1024 x i32]* %x05, i32 0, i32 %counter.2
+  store i32 %val5.2, i32* %xptr5.2
+  %xptr6.2 = getelementptr [1024 x i32], [1024 x i32]* %x06, i32 0, i32 %counter.2
+  store i32 %val5.2, i32* %xptr6.2
+  br label %loop.2.inc
+
+loop.2.inc:
+  %inc.2 = add i32 %counter.2, 2
+  %4 = icmp sge i32 %inc.2, 1023
+  br i1 %4, label %exit.2, label %loop.2.header
+
+exit.2:
+  %x2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 6
+  %x3 = load i32, i32* %x2
+  %out2 = getelementptr i32, i32 * %out, i32 1
+  store i32 %x3, i32 * %out2
+  ret void
+}