Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -59,6 +59,8 @@
     getHalfRateInstrCost() : getQuarterRateInstrCost();
   }
 
+  int getSimpleIntrinsicCost(MVT::SimpleValueType VT, unsigned IID) const;
+
 public:
   explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
     : BaseT(TM, F.getParent()->getDataLayout()),
@@ -102,6 +104,15 @@
   unsigned getCFInstrCost(unsigned Opcode);
 
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+
+  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                                 ArrayRef<Type *> Tys,
+                                 FastMathFlags FMF,
+                                 unsigned ScalarizationCostPassed = UINT_MAX);
+  int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<Value *> Args, FastMathFlags FMF,
+                            unsigned VF = 1);
+
   bool isSourceOfDivergence(const Value *V) const;
 
   unsigned getFlatAddressSpace() const {
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -254,6 +254,93 @@
   return 64;
 }
 
+// Helper function for getIntrinsicCost and getIntrinsicInstrCost.
+int AMDGPUTTIImpl::getSimpleIntrinsicCost(MVT::SimpleValueType VT,
+                                          unsigned IID) const {
+  switch (IID) {
+  case Intrinsic::fma: {
+    if (VT == MVT::f32) {
+      if (ST->hasFastFMAF32())
+        return getFullRateInstrCost();
+    } else if (VT == MVT::f16) {
+      if (ST->has16BitInsts())
+        return getFullRateInstrCost();
+
+      // TODO: Really need cost of conversions + f32 FMA
+    } else if (VT == MVT::v2f16) {
+      llvm_unreachable("packed types handled separately");
+    }
+
+    return getQuarterRateInstrCost();
+  }
+  case Intrinsic::floor: {
+    const int FullRateCost = getFullRateInstrCost();
+    if (VT == MVT::f32 || VT == MVT::f16)
+      return FullRateCost;
+
+    const int FP64RateCost = get64BitInstrCost();
+    if (ST->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+      return FP64RateCost;
+
+    int Cost = getSimpleIntrinsicCost(VT, Intrinsic::trunc);
+    Cost += 2 * FullRateCost; // setcc x2 i32
+    Cost += FullRateCost;     // and i1
+    Cost += 2 * FullRateCost; // select
+    Cost += FP64RateCost;     // fadd
+
+    return Cost;
+  }
+  case Intrinsic::trunc: {
+    const int FullRateCost = getFullRateInstrCost();
+    if (VT == MVT::f32 || VT == MVT::f16)
+      return FullRateCost;
+
+    const int FP64RateCost = get64BitInstrCost();
+    if (ST->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+      return FP64RateCost;
+
+    int Cost = FullRateCost;  // bfe i32
+    Cost += FullRateCost;     // sub i32
+    Cost += FP64RateCost;     // sra i64
+    Cost += 2 * FullRateCost; // not i64
+    Cost += FullRateCost;     // and i32
+    Cost += 2 * FullRateCost; // setcc i32 x2
+    Cost += 2 * FullRateCost; // and i64
+    Cost += 4 * FullRateCost; // select x2 i64
+
+    return Cost;
+  }
+  case Intrinsic::ctlz:
+  case Intrinsic::cttz: {
+    // FIXME: This sees the legalized type, so doesn't work correctly for
+    // i8/i16.
+    const int FullRateCost = getFullRateInstrCost();
+    if (VT == MVT::i32)
+      return FullRateCost;
+
+    // i64 requires 2 instructions. Illegal types require an additional add.
+    return 2 * FullRateCost;
+  }
+  case Intrinsic::amdgcn_workitem_id_x:
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::amdgcn_workgroup_id_x:
+  case Intrinsic::amdgcn_workgroup_id_y:
+  case Intrinsic::amdgcn_workgroup_id_z:
+  case Intrinsic::amdgcn_kernarg_segment_ptr:
+  case Intrinsic::amdgcn_implicitarg_ptr:
+  case Intrinsic::amdgcn_implicit_buffer_ptr:
+  case Intrinsic::amdgcn_queue_ptr:
+  case Intrinsic::amdgcn_dispatch_ptr:
+  case Intrinsic::amdgcn_dispatch_id:
+  case Intrinsic::amdgcn_groupstaticsize:
+  case Intrinsic::amdgcn_unreachable:
+  case Intrinsic::amdgcn_wave_barrier:
+    return 0;
+  default:
+    return -1;
+  }
+}
+
 int AMDGPUTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
@@ -375,6 +462,44 @@
   }
 }
 
+unsigned AMDGPUTTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                                              ArrayRef<Type *> Tys,
+                                              FastMathFlags FMF,
+                                              unsigned ScalarizationCostPassed) {
+  EVT OrigTy = TLI->getValueType(DL, RetTy);
+  if (!OrigTy.isSimple())
+    return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF,
+                                        ScalarizationCostPassed);
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+  if (LT.second == MVT::v2f16) {
+    assert(ST->hasVOP3PInsts());
+    switch (IID) {
+    case Intrinsic::fma:
+    case Intrinsic::fmuladd:
+      return LT.first * getFullRateInstrCost();
+    default:
+      break;
+    }
+  }
+
+  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
+
+  const int Cost = getSimpleIntrinsicCost(SLT, IID);
+  if (Cost == -1)
+    return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF,
+                                        ScalarizationCostPassed);
+
+  unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
+  return Cost * LT.first * NElts;
+}
+
+int AMDGPUTTIImpl::getIntrinsicInstrCost(
+    Intrinsic::ID IID, Type *RetTy,
+    ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
+}
+
 static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
   switch (I->getIntrinsicID()) {
   case Intrinsic::amdgcn_workitem_id_x:
Index: test/Analysis/CostModel/AMDGPU/ctlz.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/ctlz.ll
@@ -0,0 +1,87 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,VI %s
+; FIXME: CI, VI should have same costs
+
+declare i8 @llvm.ctlz.i8(i8, i1) #0
+declare i16 @llvm.ctlz.i16(i16, i1) #0
+declare i32 @llvm.ctlz.i32(i32, i1) #0
+declare i64 @llvm.ctlz.i64(i64, i1) #0
+
+; GCN-LABEL: 'ctlz_i32'
+; GCN: estimated cost of 1 for {{.*}} call i32 @llvm.ctlz.i32
+define void @ctlz_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr) #1 {
+  %vec = load i32, i32 addrspace(1)* %vaddr
+  %trunc = call i32 @llvm.ctlz.i32(i32 %vec, i1 false)
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_zero_undef_i32'
+; GCN: estimated cost of 1 for {{.*}} call i32 @llvm.ctlz.i32
+define void @ctlz_zero_undef_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr) #1 {
+  %vec = load i32, i32 addrspace(1)* %vaddr
+  %trunc = call i32 @llvm.ctlz.i32(i32 %vec, i1 true)
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_i64'
+; GCN: estimated cost of 2 for {{.*}} call i64 @llvm.ctlz.i64
+define void @ctlz_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr) #1 {
+  %vec = load i64, i64 addrspace(1)* %vaddr
+  %trunc = call i64 @llvm.ctlz.i64(i64 %vec, i1 false)
+  store i64 %trunc, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_zero_undef_i64'
+; GCN: estimated cost of 2 for {{.*}} call i64 @llvm.ctlz.i64
+define void @ctlz_zero_undef_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr) #1 {
+  %vec = load i64, i64 addrspace(1)* %vaddr
+  %trunc = call i64 @llvm.ctlz.i64(i64 %vec, i1 true)
+  store i64 %trunc, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_i8'
+; CI: estimated cost of 1 for {{.*}} call i8 @llvm.ctlz.i8
+; VI: estimated cost of 2 for {{.*}} call i8 @llvm.ctlz.i8
+define void @ctlz_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %vaddr) #1 {
+  %vec = load i8, i8 addrspace(1)* %vaddr
+  %trunc = call i8 @llvm.ctlz.i8(i8 %vec, i1 false)
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_zero_undef_i8'
+; CI: estimated cost of 1 for {{.*}} call i8 @llvm.ctlz.i8
+; VI: estimated cost of 2 for {{.*}} call i8 @llvm.ctlz.i8
+define void @ctlz_zero_undef_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %vaddr) #1 {
+  %vec = load i8, i8 addrspace(1)* %vaddr
+  %trunc = call i8 @llvm.ctlz.i8(i8 %vec, i1 true)
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_i16'
+; CI: estimated cost of 1 for {{.*}} call i16 @llvm.ctlz.i16
+; VI: estimated cost of 2 for {{.*}} call i16 @llvm.ctlz.i16
+define void @ctlz_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr) #1 {
+  %vec = load i16, i16 addrspace(1)* %vaddr
+  %trunc = call i16 @llvm.ctlz.i16(i16 %vec, i1 false)
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'ctlz_zero_undef_i16'
+; CI: estimated cost of 1 for {{.*}} call i16 @llvm.ctlz.i16
+; VI: estimated cost of 2 for {{.*}} call i16 @llvm.ctlz.i16
+define void @ctlz_zero_undef_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr) #1 {
+  %vec = load i16, i16 addrspace(1)* %vaddr
+  %trunc = call i16 @llvm.ctlz.i16(i16 %vec, i1 true)
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
Index: test/Analysis/CostModel/AMDGPU/cttz.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/cttz.ll
@@ -0,0 +1,86 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,VI %s
+
+declare i8 @llvm.cttz.i8(i8, i1) #0
+declare i16 @llvm.cttz.i16(i16, i1) #0
+declare i32 @llvm.cttz.i32(i32, i1) #0
+declare i64 @llvm.cttz.i64(i64, i1) #0
+
+; GCN-LABEL: 'cttz_i32'
+; GCN: estimated cost of 1 for {{.*}} call i32 @llvm.cttz.i32
+define void @cttz_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr) #1 {
+  %vec = load i32, i32 addrspace(1)* %vaddr
+  %trunc = call i32 @llvm.cttz.i32(i32 %vec, i1 false)
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_zero_undef_i32'
+; GCN: estimated cost of 1 for {{.*}} call i32 @llvm.cttz.i32
+define void @cttz_zero_undef_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr) #1 {
+  %vec = load i32, i32 addrspace(1)* %vaddr
+  %trunc = call i32 @llvm.cttz.i32(i32 %vec, i1 true)
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_i64'
+; GCN: estimated cost of 2 for {{.*}} call i64 @llvm.cttz.i64
+define void @cttz_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr) #1 {
+  %vec = load i64, i64 addrspace(1)* %vaddr
+  %trunc = call i64 @llvm.cttz.i64(i64 %vec, i1 false)
+  store i64 %trunc, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_zero_undef_i64'
+; GCN: estimated cost of 2 for {{.*}} call i64 @llvm.cttz.i64
+define void @cttz_zero_undef_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr) #1 {
+  %vec = load i64, i64 addrspace(1)* %vaddr
+  %trunc = call i64 @llvm.cttz.i64(i64 %vec, i1 true)
+  store i64 %trunc, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_i8'
+; CI: estimated cost of 1 for {{.*}} call i8 @llvm.cttz.i8
+; VI: estimated cost of 2 for {{.*}} call i8 @llvm.cttz.i8
+define void @cttz_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %vaddr) #1 {
+  %vec = load i8, i8 addrspace(1)* %vaddr
+  %trunc = call i8 @llvm.cttz.i8(i8 %vec, i1 false)
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_zero_undef_i8'
+; CI: estimated cost of 1 for {{.*}} call i8 @llvm.cttz.i8
+; VI: estimated cost of 2 for {{.*}} call i8 @llvm.cttz.i8
+define void @cttz_zero_undef_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %vaddr) #1 {
+  %vec = load i8, i8 addrspace(1)* %vaddr
+  %trunc = call i8 @llvm.cttz.i8(i8 %vec, i1 true)
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_i16'
+; CI: estimated cost of 1 for {{.*}} call i16 @llvm.cttz.i16
+; VI: estimated cost of 2 for {{.*}} call i16 @llvm.cttz.i16
+define void @cttz_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr) #1 {
+  %vec = load i16, i16 addrspace(1)* %vaddr
+  %trunc = call i16 @llvm.cttz.i16(i16 %vec, i1 false)
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: 'cttz_zero_undef_i16'
+; CI: estimated cost of 1 for {{.*}} call i16 @llvm.cttz.i16
+; VI: estimated cost of 2 for {{.*}} call i16 @llvm.cttz.i16
+define void @cttz_zero_undef_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr) #1 {
+  %vec = load i16, i16 addrspace(1)* %vaddr
+  %trunc = call i16 @llvm.cttz.i16(i16 %vec, i1 true)
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
Index: test/Analysis/CostModel/AMDGPU/ffloor.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/ffloor.ll
@@ -0,0 +1,113 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefix=SI-FASTFP64 -check-prefix=ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=verde < %s | FileCheck -check-prefix=SI-SLOWFP64 -check-prefix=ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefix=CI-FASTFP64 -check-prefix=ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=bonaire < %s | FileCheck -check-prefix=CI-SLOWFP64 -check-prefix=ALL %s
+
+; ALL: 'floor_f32'
+; ALL: estimated cost of 1 for {{.*}} call float @llvm.floor.f32
+define void @floor_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
+  %vec = load float, float addrspace(1)* %vaddr
+  %floor = call float @llvm.floor.f32(float %vec) #1
+  store float %floor, float addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_v2f32'
+; ALL: estimated cost of 2 for {{.*}} call <2 x float> @llvm.floor.v2f32
+define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
+  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
+  %floor = call <2 x float> @llvm.floor.v2f32(<2 x float> %vec) #1
+  store <2 x float> %floor, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_v3f32'
+; ALL: estimated cost of 3 for {{.*}} call <3 x float> @llvm.floor.v3f32
+define void @floor_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
+  %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
+  %floor = call <3 x float> @llvm.floor.v3f32(<3 x float> %vec) #1
+  store <3 x float> %floor, <3 x float> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_f64'
+; SI-FASTFP64: estimated cost of 22 for {{.*}} call double @llvm.floor.f64
+; SI-SLOWFP64: estimated cost of 24 for {{.*}} call double @llvm.floor.f64
+
+; CI-FASTFP64: estimated cost of 2 for {{.*}} call double @llvm.floor.f64
+; CI-SLOWFP64: estimated cost of 3 for {{.*}} call double @llvm.floor.f64
+define void @floor_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
+  %vec = load double, double addrspace(1)* %vaddr
+  %floor = call double @llvm.floor.f64(double %vec) #1
+  store double %floor, double addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_v2f64'
+; SI-FASTFP64: estimated cost of 44 for {{.*}} call <2 x double> @llvm.floor.v2f64
+; SI-SLOWFP64: estimated cost of 48 for {{.*}} call <2 x double> @llvm.floor.v2f64
+
+; CI-FASTFP64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.floor.v2f64
+; CI-SLOWFP64: estimated cost of 6 for {{.*}} call <2 x double> @llvm.floor.v2f64
+define void @floor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
+  %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
+  %floor = call <2 x double> @llvm.floor.v2f64(<2 x double> %vec) #1
+  store <2 x double> %floor, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_v3f64'
+; SI-FASTFP64: estimated cost of 66 for {{.*}} call <3 x double> @llvm.floor.v3f64
+; SI-SLOWFP64: estimated cost of 72 for {{.*}} call <3 x double> @llvm.floor.v3f64
+
+; CI-FASTFP64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.floor.v3f64
+; CI-SLOWFP64: estimated cost of 9 for {{.*}} call <3 x double> @llvm.floor.v3f64
+define void @floor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
+  %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
+  %floor = call <3 x double> @llvm.floor.v3f64(<3 x double> %vec) #1
+  store <3 x double> %floor, <3 x double> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_f16'
+; ALL: estimated cost of 1 for {{.*}} call half @llvm.floor.f16
+define void @floor_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
+  %vec = load half, half addrspace(1)* %vaddr
+  %floor = call half @llvm.floor.f16(half %vec) #1
+  store half %floor, half addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'floor_v2f16'
+; ALL: estimated cost of 2 for {{.*}} call <2 x half> @llvm.floor.v2f16
+define void @floor_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
+  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
+  %floor = call <2 x half> @llvm.floor.v2f16(<2 x half> %vec) #1
+  store <2 x half> %floor, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should be 3
+; ALL: 'floor_v3f16'
+; ALL: estimated cost of 8 for {{.*}} call <3 x half> @llvm.floor.v3f16
+define void @floor_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
+  %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
+  %floor = call <3 x half> @llvm.floor.v3f16(<3 x half> %vec) #1
+  store <3 x half> %floor, <3 x half> addrspace(1)* %out
+  ret void
+}
+
+declare float @llvm.floor.f32(float) #1
+declare <2 x float> @llvm.floor.v2f32(<2 x float>) #1
+declare <3 x float> @llvm.floor.v3f32(<3 x float>) #1
+
+declare double @llvm.floor.f64(double) #1
+declare <2 x double> @llvm.floor.v2f64(<2 x double>) #1
+declare <3 x double> @llvm.floor.v3f64(<3 x double>) #1
+
+declare half @llvm.floor.f16(half) #1
+declare <2 x half> @llvm.floor.v2f16(<2 x half>) #1
+declare <3 x half> @llvm.floor.v3f16(<3 x half>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Index: test/Analysis/CostModel/AMDGPU/fma.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/fma.ll
@@ -0,0 +1,83 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,FASTFMA32,SICI,SICI-FASTFMA %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=verde < %s | FileCheck -check-prefixes=GCN,SLOWFMA32,SICI,SICI-SLOWFMA %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,FASTFMA32,SICI,SICI-FASTFMA %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SLOWFMA32,SICI,SICI-SLOWFMA %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,SLOWFMA32,VI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,FASTFMA32,GFX9 %s
+
+; FASTFMA32: Found an estimated cost of 1 for instruction: %fma = call float @llvm.fma.f32(
+; SLOWFMA32: Found an estimated cost of 3 for instruction: %fma = call float @llvm.fma.f32(
+define float @fma_f32(float %a, float %b, float %c) #0 {
+  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+  ret float %fma
+}
+
+; FASTFMA32: Found an estimated cost of 2 for instruction: %fma = call <2 x float> @llvm.fma.v2f32(
+; SLOWFMA32: Found an estimated cost of 6 for instruction: %fma = call <2 x float> @llvm.fma.v2f32(
+define <2 x float> @fma_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+  %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+  ret <2 x float> %fma
+}
+
+; GCN: Cost Model: Found an estimated cost of 3 for instruction: %fma = call double @llvm.fma.f64(
+define double @fma_f64(double %a, double %b, double %c) #0 {
+  %fma = call double @llvm.fma.f64(double %a, double %b, double %c)
+  ret double %fma
+}
+
+; GCN: Found an estimated cost of 6 for instruction: %fma = call <2 x double> @llvm.fma.v2f64(
+define <2 x double> @fma_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
+  %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %fma
+}
+
+; FIXME: Should be expensive for SI because of conversions
+; SICI-FASTFMA: Found an estimated cost of 1 for instruction: %fma = call half @llvm.fma.f16(
+; SICI-SLOWFMA: Found an estimated cost of 3 for instruction: %fma = call half @llvm.fma.f16(
+; VI: Found an estimated cost of 1 for instruction: %fma = call half @llvm.fma.f16(
+; GFX9: Found an estimated cost of 1 for instruction: %fma = call half @llvm.fma.f16(
+define half @fma_f16(half %a, half %b, half %c) #0 {
+  %fma = call half @llvm.fma.f16(half %a, half %b, half %c)
+  ret half %fma
+}
+
+; SICI-FASTFMA: Cost Model: Found an estimated cost of 2 for instruction: %fma = call <2 x half> @llvm.fma.v2f16(
+; SICI-SLOWFMA: Cost Model: Found an estimated cost of 6 for instruction: %fma = call <2 x half> @llvm.fma.v2f16(
+; VI: Cost Model: Found an estimated cost of 2 for instruction: %fma = call <2 x half> @llvm.fma.v2f16(
+; GFX9: Cost Model: Found an estimated cost of 1 for instruction: %fma = call <2 x half> @llvm.fma.v2f16(
+define <2 x half> @fma_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+  %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+  ret <2 x half> %fma
+}
+
+; FIXME: gfx9 should be 2
+; SICI: Cost Model: Found an estimated cost of 8 for instruction: %fma = call <3 x half> @llvm.fma.v3f16(
+; VI: Cost Model: Found an estimated cost of 8 for instruction: %fma = call <3 x half> @llvm.fma.v3f16(
+; GFX9: Cost Model: Found an estimated cost of 4 for instruction: %fma = call <3 x half> @llvm.fma.v3f16(
+define <3 x half> @fma_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) #0 {
+  %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c)
+  ret <3 x half> %fma
+}
+
+; SICI-FASTFMA: Cost Model: Found an estimated cost of 4 for instruction: %fma = call <4 x half> @llvm.fma.v4f16(
+; SICI-SLOWFMA: Cost Model: Found an estimated cost of 12 for instruction: %fma = call <4 x half> @llvm.fma.v4f16(
+; VI: Cost Model: Found an estimated cost of 4 for instruction: %fma = call <4 x half> @llvm.fma.v4f16(
+; GFX9: Cost Model: Found an estimated cost of 2 for instruction: %fma = call <4 x half> @llvm.fma.v4f16(
+define <4 x half> @fma_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+  %fma = call <4 x half> @llvm.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
+  ret <4 x half> %fma
+}
+
+declare float @llvm.fma.f32(float, float, float) #1
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
+
+declare half @llvm.fma.f16(half, half, half) #1
+declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
+declare <3 x half> @llvm.fma.v3f16(<3 x half>, <3 x half>, <3 x half>) #1
+declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>) #1
+
+declare double @llvm.fma.f64(double, double, double) #1
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Index: test/Analysis/CostModel/AMDGPU/free-intrinsics.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/free-intrinsics.ll
@@ -0,0 +1,60 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s
+
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %workitem.id.x = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %workitem.id.y = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %workitem.id.z = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %workgroup.id.x = call i32 @llvm.amdgcn.workgroup.id.x()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %workgroup.id.y = call i32 @llvm.amdgcn.workgroup.id.y()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %workgroup.id.z = call i32 @llvm.amdgcn.workgroup.id.z()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %implicit.buffer.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %queue_ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %dispatch_ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.amdgcn.unreachable()
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.amdgcn.wave.barrier()
+
+define void @test() #0 {
+  %workitem.id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %workitem.id.y = call i32 @llvm.amdgcn.workitem.id.y()
+  %workitem.id.z = call i32 @llvm.amdgcn.workitem.id.z()
+  %workgroup.id.x = call i32 @llvm.amdgcn.workgroup.id.x()
+  %workgroup.id.y = call i32 @llvm.amdgcn.workgroup.id.y()
+  %workgroup.id.z = call i32 @llvm.amdgcn.workgroup.id.z()
+
+  %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+  %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+  %implicit.buffer.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %queue_ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+  %dispatch_ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+
+  %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
+  %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()
+
+  call void @llvm.amdgcn.unreachable()
+  call void @llvm.amdgcn.wave.barrier()
+
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare i32 @llvm.amdgcn.workitem.id.y() #1
+declare i32 @llvm.amdgcn.workitem.id.z() #1
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.y() #1
+declare i32 @llvm.amdgcn.workgroup.id.z() #1
+declare i64 @llvm.amdgcn.dispatch.id() #1
+declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+declare void @llvm.amdgcn.unreachable() #0
+declare i32 @llvm.amdgcn.groupstaticsize() #1
+declare void @llvm.amdgcn.wave.barrier() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { convergent nounwind }
Index: test/Analysis/CostModel/AMDGPU/ftrunc.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/ftrunc.ll
@@ -0,0 +1,113 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefix=SI-FASTFP64 -check-prefix=ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=verde < %s | FileCheck -check-prefix=SI-SLOWFP64 -check-prefix=ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefix=CI-FASTFP64 -check-prefix=ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=bonaire < %s | FileCheck -check-prefix=CI-SLOWFP64 -check-prefix=ALL %s
+
+; ALL: 'trunc_f32'
+; ALL: estimated cost of 1 for {{.*}} call float @llvm.trunc.f32
+define void @trunc_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
+  %vec = load float, float addrspace(1)* %vaddr
+  %trunc = call float @llvm.trunc.f32(float %vec) #1
+  store float %trunc, float addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_v2f32'
+; ALL: estimated cost of 2 for {{.*}} call <2 x float> @llvm.trunc.v2f32
+define void @trunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
+  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
+  %trunc = call <2 x float> @llvm.trunc.v2f32(<2 x float> %vec) #1
+  store <2 x float> %trunc, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_v3f32'
+; ALL: estimated cost of 3 for {{.*}} call <3 x float> @llvm.trunc.v3f32
+define void @trunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
+  %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
+  %trunc = call <3 x float> @llvm.trunc.v3f32(<3 x float> %vec) #1
+  store <3 x float> %trunc, <3 x float> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_f64'
+; SI-FASTFP64: estimated cost of 15 for {{.*}} call double @llvm.trunc.f64
+; SI-SLOWFP64: estimated cost of 16 for {{.*}} call double @llvm.trunc.f64
+
+; CI-FASTFP64: estimated cost of 2 for {{.*}} call double @llvm.trunc.f64
+; CI-SLOWFP64: estimated cost of 3 for {{.*}} call double @llvm.trunc.f64
+define void @trunc_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
+  %vec = load double, double addrspace(1)* %vaddr
+  %trunc = call double @llvm.trunc.f64(double %vec) #1
+  store double %trunc, double addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_v2f64'
+; SI-FASTFP64: estimated cost of 30 for {{.*}} call <2 x double> @llvm.trunc.v2f64
+; SI-SLOWFP64: estimated cost of 32 for {{.*}} call <2 x double> @llvm.trunc.v2f64
+
+; CI-FASTFP64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.trunc.v2f64
+; CI-SLOWFP64: estimated cost of 6 for {{.*}} call <2 x double> @llvm.trunc.v2f64
+define void @trunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
+  %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
+  %trunc = call <2 x double> @llvm.trunc.v2f64(<2 x double> %vec) #1
+  store <2 x double> %trunc, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_v3f64'
+; SI-FASTFP64: estimated cost of 45 for {{.*}} call <3 x double> @llvm.trunc.v3f64
+; SI-SLOWFP64: estimated cost of 48 for {{.*}} call <3 x double> @llvm.trunc.v3f64
+
+; CI-FASTFP64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.trunc.v3f64
+; CI-SLOWFP64: estimated cost of 9 for {{.*}} call <3 x double> @llvm.trunc.v3f64
+define void @trunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
+  %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
+  %trunc = call <3 x double> @llvm.trunc.v3f64(<3 x double> %vec) #1
+  store <3 x double> %trunc, <3 x double> addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_f16'
+; ALL: estimated cost of 1 for {{.*}} call half @llvm.trunc.f16
+define void @trunc_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
+  %vec = load half, half addrspace(1)* %vaddr
+  %trunc = call half @llvm.trunc.f16(half %vec) #1
+  store half %trunc, half addrspace(1)* %out
+  ret void
+}
+
+; ALL: 'trunc_v2f16'
+; ALL: estimated cost of 2 for {{.*}} call <2 x half> @llvm.trunc.v2f16
+define void @trunc_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
+  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
+  %trunc = call <2 x half> @llvm.trunc.v2f16(<2 x half> %vec) #1
+  store <2 x half> %trunc, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should be 3
+; ALL: 'trunc_v3f16'
+; ALL: estimated cost of 8 for {{.*}} call <3 x half> @llvm.trunc.v3f16
+define void @trunc_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
+  %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
+  %trunc = call <3 x half> @llvm.trunc.v3f16(<3 x half> %vec) #1
+  store <3 x half> %trunc, <3 x half> addrspace(1)* %out
+  ret void
+}
+
+declare float @llvm.trunc.f32(float) #1
+declare <2 x float> @llvm.trunc.v2f32(<2 x float>) #1
+declare <3 x float> @llvm.trunc.v3f32(<3 x float>) #1
+
+declare double @llvm.trunc.f64(double) #1
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>) #1
+declare <3 x double> @llvm.trunc.v3f64(<3 x double>) #1
+
+declare half @llvm.trunc.f16(half) #1
+declare <2 x half> @llvm.trunc.v2f16(<2 x half>) #1
+declare <3 x half> @llvm.trunc.v3f16(<3 x half>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
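+
+; A rough sketch of where the SI (pre-CI) f64 numbers above come from,
+; assuming the rate constants in AMDGPUTargetTransformInfo.cpp (a full-rate
+; op costs 1; get64BitInstrCost() is 2 with fast FP64 and 3 with slow FP64):
+;   trunc f64 expands to 13 full-rate ops plus one FP64-rate op, giving
+;   13 + 2 = 15 (SI-FASTFP64) and 13 + 3 = 16 (SI-SLOWFP64).
+; floor f64 (see ffloor.ll) adds 5 more full-rate ops and one FP64-rate fadd
+; on top of the trunc expansion, giving 22 and 24 respectively.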