diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1103,9 +1103,10 @@
                          unsigned Index = -1) const;
 
   /// \return The expected cost of control-flow related instructions such as
-  /// Phi, Ret, Br.
+  /// Phi, Ret, Br, Switch.
   int getCFInstrCost(unsigned Opcode,
-                     TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const;
+                     TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+                     const Instruction *I = nullptr) const;
 
   /// \returns The expected cost of compare and select instructions. If there
   /// is an existing instruction that holds Opcode, it may be passed in the
@@ -1573,8 +1574,8 @@
                                 const Instruction *I) = 0;
   virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                        VectorType *VecTy, unsigned Index) = 0;
-  virtual int getCFInstrCost(unsigned Opcode,
-                             TTI::TargetCostKind CostKind) = 0;
+  virtual int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                             const Instruction *I = nullptr) = 0;
   virtual int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                  CmpInst::Predicate VecPred,
                                  TTI::TargetCostKind CostKind,
@@ -2040,8 +2041,9 @@
                                unsigned Index) override {
     return Impl.getExtractWithExtendCost(Opcode, Dst, VecTy, Index);
   }
-  int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) override {
-    return Impl.getCFInstrCost(Opcode, CostKind);
+  int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                     const Instruction *I = nullptr) override {
+    return Impl.getCFInstrCost(Opcode, CostKind, I);
   }
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                          CmpInst::Predicate VecPred,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -512,7 +512,8 @@
     return 1;
   }
 
-  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) const {
+  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                          const Instruction *I = nullptr) const {
    // A phi would be free, unless we're costing the throughput because it
    // will require a register.
    if (Opcode == Instruction::PHI && CostKind != TTI::TCK_RecipThroughput)
@@ -933,7 +934,8 @@
   case Instruction::Br:
   case Instruction::Ret:
   case Instruction::PHI:
-    return TargetTTI->getCFInstrCost(Opcode, CostKind);
+  case Instruction::Switch:
+    return TargetTTI->getCFInstrCost(Opcode, CostKind, I);
   case Instruction::ExtractValue:
   case Instruction::Freeze:
     return TTI::TCC_Free;
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -897,8 +897,9 @@
                                     TTI::CastContextHint::None,
                                     TTI::TCK_RecipThroughput);
   }
 
-  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
-    return BaseT::getCFInstrCost(Opcode, CostKind);
+  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                          const Instruction *I = nullptr) {
+    return BaseT::getCFInstrCost(Opcode, CostKind, I);
   }
 
   unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -783,8 +783,11 @@
 }
 
 int TargetTransformInfo::getCFInstrCost(unsigned Opcode,
-                                        TTI::TargetCostKind CostKind) const {
-  int Cost = TTIImpl->getCFInstrCost(Opcode, CostKind);
+                                        TTI::TargetCostKind CostKind,
+                                        const Instruction *I) const {
+  assert((I == nullptr || I->getOpcode() == Opcode) &&
+         "Opcode should reflect passed instruction.");
+  int Cost = TTIImpl->getCFInstrCost(Opcode, CostKind, I);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
@@ -1374,6 +1377,7 @@
   case Instruction::ExtractValue:
   case Instruction::ShuffleVector:
   case Instruction::Call:
+  case Instruction::Switch:
     return getUserCost(I, CostKind);
   default:
     // We don't have any information on this instruction.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -139,7 +139,8 @@
   int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
                                unsigned Index);
 
-  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                          const Instruction *I = nullptr);
 
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -653,7 +653,8 @@
 }
 
 unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
-                                        TTI::TargetCostKind CostKind) {
+                                        TTI::TargetCostKind CostKind,
+                                        const Instruction *I) {
   if (CostKind != TTI::TCK_RecipThroughput)
     return Opcode == Instruction::PHI ? 0 : 1;
   assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -163,7 +163,8 @@
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
       const Instruction *CxtI = nullptr);
 
-  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                          const Instruction *I = nullptr);
 
   bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                      ArrayRef<unsigned> Indices = {}) const;
@@ -253,7 +254,8 @@
   bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                     unsigned AddrSpace) const;
   unsigned getMaxInterleaveFactor(unsigned VF);
-  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                          const Instruction *I = nullptr);
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
 };
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -39,7 +39,7 @@
 static cl::opt<unsigned> UnrollThresholdIf(
   "amdgpu-unroll-threshold-if",
   cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
-  cl::init(150), cl::Hidden);
+  cl::init(200), cl::Hidden);
 
 static cl::opt<bool> UnrollRuntimeLocal(
   "amdgpu-unroll-runtime-local",
@@ -106,6 +106,10 @@
   UP.MaxCount = std::numeric_limits<unsigned>::max();
   UP.Partial = true;
 
+  // Conditional branch in a loop back edge needs 3 additional exec
+  // manipulations in average.
+  UP.BEInsns += 3;
+
   // TODO: Do we want runtime unrolling?
 
   // Maximum alloca size than can fit registers. Reserve 16 registers.
@@ -809,18 +813,37 @@
 }
 
 unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
-                                    TTI::TargetCostKind CostKind) {
-  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
-    return Opcode == Instruction::PHI ? 0 : 1;
-
-  // XXX - For some reason this isn't called for switch.
+                                    TTI::TargetCostKind CostKind,
+                                    const Instruction *I) {
+  assert((I == nullptr || I->getOpcode() == Opcode) &&
+         "Opcode should reflect passed instruction.");
+  const bool SCost =
+      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
+  const int CBrCost = SCost ? 5 : 7;
   switch (Opcode) {
-  case Instruction::Br:
+  case Instruction::Br: {
+    // Branch instruction takes about 4 slots on gfx900.
+    auto BI = dyn_cast_or_null<BranchInst>(I);
+    if (BI && BI->isUnconditional())
+      return SCost ? 1 : 4;
+    // Suppose conditional branch takes additional 3 exec manipulations
+    // instructions in average.
+    return CBrCost;
+  }
+  case Instruction::Switch: {
+    auto SI = dyn_cast_or_null<SwitchInst>(I);
+    // Each case (including default) takes 1 cmp + 1 cbr instructions in
+    // average.
+    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
+  }
   case Instruction::Ret:
-    return 10;
-  default:
-    return BaseT::getCFInstrCost(Opcode, CostKind);
+    return SCost ? 1 : 10;
+  case Instruction::PHI:
+    // TODO: 1. A prediction phi won't be eliminated?
+    //       2. Estimate data copy instructions in this case.
+    return 1;
   }
+  return BaseT::getCFInstrCost(Opcode, CostKind, I);
 }
 
 int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
@@ -1292,7 +1315,8 @@
 }
 
 unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
-                                     TTI::TargetCostKind CostKind) {
+                                     TTI::TargetCostKind CostKind,
+                                     const Instruction *I) {
   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
     return Opcode == Instruction::PHI ? 0 : 1;
@@ -1302,7 +1326,7 @@
   case Instruction::Ret:
     return 10;
   default:
-    return BaseT::getCFInstrCost(Opcode, CostKind);
+    return BaseT::getCFInstrCost(Opcode, CostKind, I);
   }
 }
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -198,8 +198,8 @@
   bool shouldExpandReduction(const IntrinsicInst *II) const { return false; }
 
-  int getCFInstrCost(unsigned Opcode,
-                     TTI::TargetCostKind CostKind);
+  int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                     const Instruction *I = nullptr);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -379,7 +379,8 @@
   return getIntImmCost(Imm, Ty, CostKind);
 }
 
-int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                               const Instruction *I) {
   if (CostKind == TTI::TCK_RecipThroughput &&
       (ST->hasNEON() || ST->hasMVEIntegerOps())) {
     // FIXME: The vectorizer is highly sensistive to the cost of these
@@ -388,7 +389,7 @@
     // vector targets.
     return 0;
   }
-  return BaseT::getCFInstrCost(Opcode, CostKind);
+  return BaseT::getCFInstrCost(Opcode, CostKind, I);
 }
 
 int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -153,7 +153,8 @@
                            const Instruction *I = nullptr);
   unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
 
-  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                          const Instruction *I = nullptr) {
     return 1;
   }
 
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -112,7 +112,8 @@
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
                        const Instruction *I = nullptr);
-  int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+  int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                     const Instruction *I = nullptr);
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                          CmpInst::Predicate VecPred,
                          TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1000,11 +1000,12 @@
                nullptr);
 }
 
-int PPCTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+int PPCTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                               const Instruction *I) {
   if (CostKind != TTI::TCK_RecipThroughput)
     return Opcode == Instruction::PHI ? 0 : 1;
   // Branches are assumed to be predicted.
-  return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
+  return 0;
 }
 
 int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -203,7 +203,8 @@
   int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
 
-  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+                          const Instruction *I = nullptr);
 
   int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4076,12 +4076,13 @@
   return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
 }
 
-unsigned
-X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+unsigned X86TTIImpl::getCFInstrCost(unsigned Opcode,
+                                    TTI::TargetCostKind CostKind,
+                                    const Instruction *I) {
   if (CostKind != TTI::TCK_RecipThroughput)
     return Opcode == Instruction::PHI ? 0 : 1;
   // Branches are assumed to be predicted.
-  return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
+  return 0;
 }
 
 int X86TTIImpl::getGatherOverhead() const {
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/br.ll b/llvm/test/Analysis/CostModel/AMDGPU/br.ll
deleted file mode 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/br.ll
+++ /dev/null
@@ -1,45 +0,0 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
-
-; CHECK: 'test_br_cost'
-; CHECK: estimated cost of 10 for instruction: br i1
-; CHECK: estimated cost of 10 for instruction: br label
-; CHECK: estimated cost of 10 for instruction: ret void
-define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
-bb0:
-  br i1 undef, label %bb1, label %bb2
-
-bb1:
-  %vec = load i32, i32 addrspace(1)* %vaddr
-  %add = add i32 %vec, %b
-  store i32 %add, i32 addrspace(1)* %out
-  br label %bb2
-
-bb2:
-  ret void
-
-}
-
-; CHECK: 'test_switch_cost'
-; CHECK: estimated cost of -1 for instruction: switch
-define amdgpu_kernel void @test_switch_cost(i32 %a) #0 {
-entry:
-  switch i32 %a, label %default [
-    i32 0, label %case0
-    i32 1, label %case1
-  ]
-
-case0:
-  store volatile i32 undef, i32 addrspace(1)* undef
-  ret void
-
-case1:
-  store volatile i32 undef, i32 addrspace(1)* undef
-  ret void
-
-default:
-  store volatile i32 undef, i32 addrspace(1)* undef
-  ret void
-
-end:
-  ret void
-}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/control-flow.ll b/llvm/test/Analysis/CostModel/AMDGPU/control-flow.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/control-flow.ll
@@ -0,0 +1,52 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck --check-prefixes=ALL,SPEED %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck --check-prefixes=ALL,SIZE %s
+
+; ALL-LABEL: 'test_br_cost'
+; SPEED: estimated cost of 7 for instruction: br i1
+; SPEED: estimated cost of 4 for instruction: br label
+; SPEED: estimated cost of 1 for instruction: %phi = phi i32 [
+; SPEED: estimated cost of 10 for instruction: ret void
+; SIZE: estimated cost of 5 for instruction: br i1
+; SIZE: estimated cost of 1 for instruction: br label
+; SIZE: estimated cost of 1 for instruction: %phi = phi i32 [
+; SIZE: estimated cost of 1 for instruction: ret void
+define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+bb0:
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  %vec = load i32, i32 addrspace(1)* %vaddr
+  %add = add i32 %vec, %b
+  store i32 %add, i32 addrspace(1)* %out
+  br label %bb2
+
+bb2:
+  %phi = phi i32 [ %b, %bb0 ], [ %add, %bb1 ]
+  ret void
+}
+
+; ALL-LABEL: 'test_switch_cost'
+; SPEED: estimated cost of 24 for instruction: switch
+; SIZE: estimated cost of 18 for instruction: switch
+define amdgpu_kernel void @test_switch_cost(i32 %a) #0 {
+entry:
+  switch i32 %a, label %default [
+    i32 0, label %case0
+    i32 1, label %case1
+  ]
+
+case0:
+  store volatile i32 undef, i32 addrspace(1)* undef
+  ret void
+
+case1:
+  store volatile i32 undef, i32 addrspace(1)* undef
+  ret void
+
+default:
+  store volatile i32 undef, i32 addrspace(1)* undef
+  ret void
+
+end:
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/unroll.ll b/llvm/test/CodeGen/AMDGPU/unroll.ll
--- a/llvm/test/CodeGen/AMDGPU/unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/unroll.ll
@@ -81,8 +81,7 @@
 for.body:                                         ; preds = %entry, %for.inc
   %i1 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
-  %and = and i32 %i1, 1
-  %tobool = icmp eq i32 %and, 0
+  %tobool = icmp eq i32 %i1, 0
   br i1 %tobool, label %for.inc, label %if.then
 
 if.then:                                          ; preds = %for.body
@@ -93,7 +92,7 @@
 for.inc:                                          ; preds = %for.body, %if.then
   %inc = add nuw nsw i32 %i1, 1
-  %cmp = icmp ult i32 %inc, 48
+  %cmp = icmp ult i32 %inc, 38
   br i1 %cmp, label %for.body, label %for.end
 
 for.end:                                          ; preds = %for.cond
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-addrspacecast.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-addrspacecast.ll
--- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-addrspacecast.ll
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-addrspacecast.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -loop-unroll -unroll-threshold=75 -unroll-peel-count=0 -unroll-allow-partial=false -unroll-max-iteration-count-to-analyze=16 < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -loop-unroll -unroll-threshold=49 -unroll-peel-count=0 -unroll-allow-partial=false -unroll-max-iteration-count-to-analyze=16 < %s | FileCheck %s
 
 ; CHECK-LABEL: @test_func_addrspacecast_cost_noop(
 ; CHECK-NOT: br i1
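
For context, a minimal sketch (not part of the patch) of how a cost-model client holding a TargetTransformInfo reference could exercise the new overload. estimateBlockCost is a hypothetical helper written only for illustration; getCFInstrCost and getUserCost are the interfaces touched above. Passing the instruction lets a target such as AMDGPU distinguish conditional from unconditional branches and price each case of a switch.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Sum the cost of a basic block, routing control-flow instructions through
// the getCFInstrCost overload added by this patch so the target can inspect
// the instruction itself (conditional vs. unconditional branch, number of
// switch cases, and so on).
static int estimateBlockCost(const BasicBlock &BB,
                             const TargetTransformInfo &TTI,
                             TargetTransformInfo::TargetCostKind CostKind) {
  int Cost = 0;
  for (const Instruction &I : BB) {
    switch (I.getOpcode()) {
    case Instruction::Br:
    case Instruction::Switch:
    case Instruction::Ret:
    case Instruction::PHI:
      Cost += TTI.getCFInstrCost(I.getOpcode(), CostKind, &I);
      break;
    default:
      // Everything else goes through the generic user-cost path, which now
      // also forwards switches to getCFInstrCost (see the
      // TargetTransformInfoImpl.h hunk above).
      Cost += TTI.getUserCost(&I, CostKind);
      break;
    }
  }
  return Cost;
}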