Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -277,6 +277,10 @@
     return 1;
   }
 
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
+    return 1;
+  }
+
   unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Ty, int Index,
                           Type *SubTp) {
     return 1;
@@ -499,6 +503,24 @@
         Operator::getOpcode(U), U->getType(),
         U->getNumOperands() == 1 ? U->getOperand(0)->getType() : nullptr);
   }
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
+    assert(Ty->isVectorTy() && "Can only scalarize vectors");
+    unsigned Cost = 0;
+
+    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+      if (Insert)
+        Cost += static_cast<T *>(this)
+                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+      if (Extract)
+        Cost += static_cast<T *>(this)
+                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+    }
+
+    return Cost;
+  }
 };
 }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -42,24 +42,6 @@
   typedef TargetTransformInfoImplCRTPBase<T> BaseT;
   typedef TargetTransformInfo TTI;
 
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
-    assert(Ty->isVectorTy() && "Can only scalarize vectors");
-    unsigned Cost = 0;
-
-    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
-      if (Insert)
-        Cost += static_cast<T *>(this)
-                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
-      if (Extract)
-        Cost += static_cast<T *>(this)
-                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
-    }
-
-    return Cost;
-  }
-
   /// Estimate the cost overhead of SK_Alternate shuffle.
   unsigned getAltShuffleOverhead(Type *Ty) {
     assert(Ty->isVectorTy() && "Can only shuffle vectors");
@@ -320,7 +302,9 @@
       // return the cost of multiple scalar invocation plus the cost of
       // inserting
      // and extracting the values.
-      return getScalarizationOverhead(Ty, true, true) + Num * Cost;
+      unsigned ScalarizeCost
+          = static_cast<T *>(this)->getScalarizationOverhead(Ty, true, true);
+      return ScalarizeCost + Num * Cost;
     }
 
     // We don't know anything about this scalar instruction.
@@ -411,19 +395,23 @@
 
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
-      return getScalarizationOverhead(Dst, true, true) + Num * Cost;
+      unsigned ScalarizeCost
+          = static_cast<T *>(this)->getScalarizationOverhead(Dst, true, true);
+      return ScalarizeCost + Num * Cost;
     }
 
     // We already handled vector-to-vector and scalar-to-scalar conversions.
     // This
     // is where we handle bitcast between vectors and scalars. We need to assume
     // that the conversion is scalarized in one way or another.
-    if (Opcode == Instruction::BitCast)
+    if (Opcode == Instruction::BitCast) {
+      T *This = static_cast<T *>(this);
       // Illegal bitcasts are done by storing and loading from a stack slot.
-      return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true)
-                                : 0) +
-             (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false)
-                                : 0);
+      return (Src->isVectorTy() ?
+                This->getScalarizationOverhead(Src, false, true) : 0) +
+             (Dst->isVectorTy() ?
+                This->getScalarizationOverhead(Dst, true, false) : 0);
+    }
 
     llvm_unreachable("Unhandled cast");
   }
@@ -464,7 +452,9 @@
       // Return the cost of multiple scalar invocation plus the cost of
      // inserting
      // and extracting the values.
-      return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
+      unsigned ScalarizeCost
+          = static_cast<T *>(this)->getScalarizationOverhead(ValTy, true, false);
+      return ScalarizeCost + Num * Cost;
     }
 
     // Unknown scalar opcode.
@@ -501,8 +491,11 @@
     if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
       // This is a vector load/store for some illegal type that is scalarized.
       // We must account for the cost of building or decomposing the vector.
-      Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
-                                       Opcode == Instruction::Store);
+
+      unsigned ScalarizeCost = static_cast<T *>(this)->
+          getScalarizationOverhead(Src, Opcode != Instruction::Store,
+                                   Opcode == Instruction::Store);
+      Cost += ScalarizeCost;
     }
   }
 
@@ -590,7 +583,8 @@
     unsigned ScalarCalls = 1;
     Type *ScalarRetTy = RetTy;
     if (RetTy->isVectorTy()) {
-      ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+      ScalarizationCost
+          = static_cast<T *>(this)->getScalarizationOverhead(RetTy, true, false);
       ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
       ScalarRetTy = RetTy->getScalarType();
     }
@@ -598,7 +592,8 @@
     for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
       Type *Ty = Tys[i];
       if (Ty->isVectorTy()) {
-        ScalarizationCost += getScalarizationOverhead(Ty, false, true);
+        ScalarizationCost +=
+            static_cast<T *>(this)->getScalarizationOverhead(Ty, false, true);
         ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
         Ty = Ty->getScalarType();
       }
@@ -725,7 +720,8 @@
     // this will emit a costly libcall, adding call overhead and spills. Make it
     // very expensive.
     if (RetTy->isVectorTy()) {
-      unsigned ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+      unsigned ScalarizationCost
+          = static_cast<T *>(this)->getScalarizationOverhead(RetTy, true, false);
       unsigned ScalarCalls = RetTy->getVectorNumElements();
       SmallVector<Type *, 4> ScalarTys;
       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
@@ -738,7 +734,9 @@
         IID, RetTy->getScalarType(), ScalarTys);
     for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
       if (Tys[i]->isVectorTy()) {
-        ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+        ScalarizationCost +=
+            static_cast<T *>(this)->getScalarizationOverhead(Tys[i], false,
+                                                             true);
         ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
       }
     }
@@ -784,7 +782,9 @@
         NumReduxLevels * (IsPairwise + 1) *
         static_cast<T *>(this)
             ->getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
-    return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
+    unsigned ScalarizeCost
+        = static_cast<T *>(this)->getScalarizationOverhead(Ty, false, true);
+    return ShuffleCost + ArithCost + ScalarizeCost;
   }
 
   /// @}
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -34,10 +34,6 @@
   const AArch64Subtarget *ST;
   const AArch64TargetLowering *TLI;
 
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
   const AArch64Subtarget *getST() const { return ST; }
   const AArch64TargetLowering *getTLI() const { return TLI; }
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -61,6 +61,10 @@
   unsigned getRegisterBitWidth(bool Vector);
   unsigned getMaxInterleaveFactor(unsigned VF);
 
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
+    return 0;
+  }
+
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
   bool isSourceOfDivergence(const Value *V) const;
 };
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -33,10 +33,6 @@
   const ARMSubtarget *ST;
   const ARMTargetLowering *TLI;
 
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
   const ARMSubtarget *getST() const { return ST; }
   const ARMTargetLowering *getTLI() const { return TLI; }
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -33,8 +33,6 @@
   const X86Subtarget *ST;
   const X86TargetLowering *TLI;
 
-  int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
   const X86Subtarget *getST() const { return ST; }
   const X86TargetLowering *getTLI() const { return TLI; }
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -934,20 +934,6 @@
   return BaseT::getVectorInstrCost(Opcode, Val, Index);
 }
 
-int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
-  assert (Ty->isVectorTy() && "Can only scalarize vectors");
-  int Cost = 0;
-
-  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
-    if (Insert)
-      Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
-    if (Extract)
-      Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
-  }
-
-  return Cost;
-}
-
 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace) {
   // Handle non-power-of-two vectors such as <3 x float>
Index: test/Analysis/CostModel/AMDGPU/add.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/add.ll
@@ -0,0 +1,56 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+
+; CHECK: 'add_i32'
+; CHECK: estimated cost of 1 for {{.*}} add i32
+define void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+  %vec = load i32, i32 addrspace(1)* %vaddr
+  %add = add i32 %vec, %b
+  store i32 %add, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'add_v2i32'
+; CHECK: estimated cost of 2 for {{.*}} add <2 x i32>
+define void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
+  %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
+  %add = add <2 x i32> %vec, %b
+  store <2 x i32> %add, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'add_v3i32'
+; CHECK: estimated cost of 3 for {{.*}} add <3 x i32>
+define void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
+  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
+  %add = add <3 x i32> %vec, %b
+  store <3 x i32> %add, <3 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'add_i64'
+; CHECK: estimated cost of 1 for {{.*}} add i64
+define void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+  %vec = load i64, i64 addrspace(1)* %vaddr
+  %add = add i64 %vec, %b
+  store i64 %add, i64 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'add_v2i64'
+; CHECK: estimated cost of 2 for {{.*}} add <2 x i64>
+define void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
+  %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
+  %add = add <2 x i64> %vec, %b
+  store <2 x i64> %add, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'add_v3i64'
+; CHECK: estimated cost of 3 for {{.*}} add <3 x i64>
+define void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
+  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
+  %add = add <3 x i64> %vec, %b
+  store <3 x i64> %add, <3 x i64> addrspace(1)* %out
+  ret void
+}
+
Index: test/Analysis/CostModel/AMDGPU/fabs.ll
===================================================================
--- test/Analysis/CostModel/AMDGPU/fabs.ll
+++ test/Analysis/CostModel/AMDGPU/fabs.ll
@@ -10,7 +10,7 @@
 }
 
 ; CHECK: 'fabs_v2f32'
-; CHECK: estimated cost of 2 for {{.*}} call <2 x float> @llvm.fabs.v2f32
+; CHECK: estimated cost of 0 for {{.*}} call <2 x float> @llvm.fabs.v2f32
 define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %vec) #1
@@ -19,7 +19,7 @@
 }
 
 ; CHECK: 'fabs_v3f32'
-; CHECK: estimated cost of 3 for {{.*}} call <3 x float> @llvm.fabs.v3f32
+; CHECK: estimated cost of 0 for {{.*}} call <3 x float> @llvm.fabs.v3f32
 define void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %vec) #1
@@ -37,7 +37,7 @@
 }
 
 ; CHECK: 'fabs_v2f64'
-; CHECK: estimated cost of 2 for {{.*}} call <2 x double> @llvm.fabs.v2f64
+; CHECK: estimated cost of 0 for {{.*}} call <2 x double> @llvm.fabs.v2f64
 define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %vec) #1
@@ -46,7 +46,7 @@
 }
 
 ; CHECK: 'fabs_v3f64'
-; CHECK: estimated cost of 3 for {{.*}} call <3 x double> @llvm.fabs.v3f64
+; CHECK: estimated cost of 0 for {{.*}} call <3 x double> @llvm.fabs.v3f64
 define void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %fabs = call <3 x double> @llvm.fabs.v3f64(<3 x double> %vec) #1