Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -68,6 +68,8 @@
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
 
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+
   unsigned getCFInstrCost(unsigned Opcode);
 
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -83,6 +83,48 @@
   return 64;
 }
 
+// TODO: Implement getIntrinsicInstrCost
+/// Estimate the cost of intrinsic \p IID on scalar value type \p VT for
+/// subtarget \p ST.
+///
+/// Only Intrinsic::fma, Intrinsic::floor and Intrinsic::trunc are modelled;
+/// any other intrinsic ID yields -1.
+static int getIntrinsicCost(const AMDGPUSubtarget &ST, MVT::SimpleValueType VT,
+                            unsigned IID) {
+  switch (IID) {
+  case Intrinsic::fma: {
+    if (VT == MVT::f32 || VT == MVT::f16) {
+      if (ST.hasFastFMAF32())
+        return TargetTransformInfo::TCC_Basic;
+
+      return 3 * TargetTransformInfo::TCC_Basic;
+    }
+
+    return 3 * TargetTransformInfo::TCC_Basic;
+  }
+  case Intrinsic::floor: {
+    if (VT == MVT::f32 || VT == MVT::f16)
+      return TargetTransformInfo::TCC_Basic;
+
+    if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+      return 2 * TargetTransformInfo::TCC_Basic;
+
+    return getIntrinsicCost(ST, VT, Intrinsic::trunc) + 7;
+  }
+  case Intrinsic::trunc: {
+    if (VT == MVT::f32 || VT == MVT::f16)
+      return TargetTransformInfo::TCC_Basic;
+
+    if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+      return 2 * TargetTransformInfo::TCC_Basic;
+
+    return 15;
+  }
+  default:
+    return -1;
+  }
+}
+
 int AMDGPUTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
@@ -144,6 +186,104 @@
                                        Opd1PropInfo, Opd2PropInfo);
 }
 
+/// Estimate the cost of the cast \p Opcode from \p Src to \p Dst.
+///
+/// Only FPToSI, FPToUI, SIToFP and UIToFP between simple value types are
+/// modelled here. Every other opcode, non-simple types, and conversions
+/// between f32/f16 and i64 are deferred to BaseT::getCastInstrCost. The
+/// per-element cost is multiplied by the element count of the legalized
+/// source type and by the source legalization cost.
+int AMDGPUTTIImpl::getCastInstrCost(unsigned Opcode,
+                                    Type *Dst, Type *Src) {
+  if (Opcode != Instruction::FPToSI &&
+      Opcode != Instruction::FPToUI &&
+      Opcode != Instruction::SIToFP &&
+      Opcode != Instruction::UIToFP)
+    return BaseT::getCastInstrCost(Opcode, Dst, Src);
+
+  EVT SrcTy = TLI->getValueType(DL, Src);
+  EVT DstTy = TLI->getValueType(DL, Dst);
+
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return BaseT::getCastInstrCost(Opcode, Dst, Src);
+
+  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
+  std::pair<int, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);
+  // NOTE(review): assumes both types legalize with the same cost factor;
+  // confirm this holds for casts between differently sized vector elements.
+  assert(SrcLT.first == DstLT.first);
+
+  unsigned NElts = SrcLT.second.isVector() ?
+    SrcLT.second.getVectorNumElements() : 1;
+
+  MVT::SimpleValueType SSrcLT = SrcLT.second.getScalarType().SimpleTy;
+  MVT::SimpleValueType SDstLT = DstLT.second.getScalarType().SimpleTy;
+
+  switch (Opcode) {
+  case Instruction::FPToSI:
+  case Instruction::FPToUI: {
+    int Cost = 0;
+    if (SSrcLT == MVT::f32 || SSrcLT == MVT::f16) {
+      if (SDstLT == MVT::i64) {
+        // f32 -> i64 expansion.
+        // FIXME: This expansion not yet implemented.
+        return BaseT::getCastInstrCost(Opcode, Dst, Src);
+      } else {
+        // f32 -> i32 full rate instruction.
+        Cost += 1;
+      }
+    } else {
+      assert(SSrcLT == MVT::f64);
+
+      if (SDstLT == MVT::i64) {
+        // f64 -> i64 expansion.
+        Cost += ::getIntrinsicCost(*ST, SSrcLT, Intrinsic::trunc);
+        Cost += ::getIntrinsicCost(*ST, SSrcLT, Intrinsic::floor);
+        Cost += ::getIntrinsicCost(*ST, SSrcLT, Intrinsic::fma);
+        Cost += 6;
+      } else {
+        // f64 -> i32 half or quarter rate instruction.
+        Cost += 2;
+      }
+    }
+
+    return NElts * SrcLT.first * Cost;
+  }
+  case Instruction::SIToFP:
+  case Instruction::UIToFP: {
+    int Cost = 0;
+    if (SDstLT == MVT::f32 || SDstLT == MVT::f16) {
+      if (SSrcLT == MVT::i64) {
+        // i64 -> f32 expansion.
+        // FIXME: This expansion not yet implemented.
+        return BaseT::getCastInstrCost(Opcode, Dst, Src);
+      } else {
+        // i32 -> f32 full rate instruction.
+        Cost = TargetTransformInfo::TCC_Basic;
+      }
+    } else {
+      // i64 to f64 expansion
+      if (SSrcLT == MVT::i64) {
+        // [su]int_to_fp (half or full)
+        // uint_to_fp (half or full)
+        // ldexp (half or full)
+        // fadd (half or full)
+        Cost = 2 + 2 + 2 + 2;
+      } else {
+        // i32 -> f64 half or quarter rate instruction.
+        Cost = 2;
+      }
+    }
+
+    return NElts * SrcLT.first * Cost;
+  }
+  default:
+    break;
+  }
+
+  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+}
+
 unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
   // XXX - For some reason this isn't called for switch.
   switch (Opcode) {
Index: test/Analysis/CostModel/AMDGPU/fptosi.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/fptosi.ll
@@ -0,0 +1,72 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=COMMON -check-prefix=SI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=COMMON -check-prefix=CI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=COMMON -check-prefix=CI %s
+
+; COMMON: 'fptosi_f32_to_i32'
+; COMMON: estimated cost of 1 for {{.*}} fptosi float %val to i32
+define void @fptosi_f32_to_i32(i32 addrspace(1)* %out, float %val) #0 {
+  %cvt = fptosi float %val to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; COMMON: 'fptosi_v32f32_to_v32i32'
+; COMMON: estimated cost of 32 for {{.*}} fptosi <32 x float> %val to <32 x i32>
+define void @fptosi_v32f32_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x float> %val) #0 {
+  %cvt = fptosi <32 x float> %val to <32 x i32>
+  store <32 x i32> %cvt, <32 x i32> addrspace(1)* %out
+  ret void
+}
+
+; COMMON: 'fptosi_f64_to_i32'
+; COMMON: estimated cost of 2 for {{.*}} fptosi double %val to i32
+define void @fptosi_f64_to_i32(i32 addrspace(1)* %out, double %val) #0 {
+  %cvt = fptosi double %val to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; COMMON: 'fptosi_f32_to_i64'
+; COMMON: estimated cost of 1 for {{.*}} fptosi float %val to i64
+define void @fptosi_f32_to_i64(i64 addrspace(1)* %out, float %val) #0 {
+  %cvt = fptosi float %val to i64
+  store i64 %cvt, i64 addrspace(1)* %out
+  ret void
+}
+
+; COMMON: 'fptosi_f64_to_i64'
+; SI: estimated cost of 46 for {{.*}} fptosi double %val to i64
+; CI: estimated cost of 13 for {{.*}} fptosi double %val to i64
+define void @fptosi_f64_to_i64(i64 addrspace(1)* %out, double %val) #0 {
+  %cvt = fptosi double %val to i64
+  store i64 %cvt, i64 addrspace(1)* %out
+  ret void
+}
+
+; COMMON: 'fptosi_v3f64_to_v3i64'
+; SI: estimated cost of 138 for {{.*}} fptosi <3 x double> %val to <3 x i64>
+; CI: estimated cost of 39 for {{.*}} fptosi <3 x double> %val to <3 x i64>
+define void @fptosi_v3f64_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x double> %val) #0 {
+  %cvt = fptosi <3 x double> %val to <3 x i64>
+  store <3 x i64> %cvt, <3 x i64> addrspace(1)* %out
+  ret void
+}
+
+; COMMON: 'fptosi_f16_to_i32'
+; COMMON: estimated cost of 1 for {{.*}} fptosi half %val to i32
+define void @fptosi_f16_to_i32(i32 addrspace(1)* %out, half %val) #0 {
+  %cvt = fptosi half %val to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should probably be a bit higher
+; COMMON: 'fptosi_f16_to_i64'
+; COMMON: estimated cost of 1 for {{.*}} fptosi half %val to i64
+define void @fptosi_f16_to_i64(i64 addrspace(1)* %out, half %val) #0 {
+  %cvt = fptosi half %val to i64
+  store i64 %cvt, i64 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
Index: test/Analysis/CostModel/AMDGPU/sitofp.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AMDGPU/sitofp.ll
@@ -0,0 +1,51 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=COMMON %s
+
+; COMMON: 'sitofp_i32_to_f32'
+; COMMON: estimated cost of 1 for {{.*}} sitofp i32 %val to float
+define void @sitofp_i32_to_f32(float addrspace(1)* %out, i32 %val) #0 {
+  %cvt = sitofp i32 %val to float
+  store float %cvt, float addrspace(1)* %out
+  ret void
+}
+
+; COMMON: 'sitofp_v32i32_to_v32f32'
+; COMMON: estimated cost of 32 for {{.*}} sitofp <32 x i32> %val to <32 x float>
+define void @sitofp_v32i32_to_v32f32(<32 x float> addrspace(1)* %out, <32 x i32> %val) #0 {
+  %cvt = sitofp <32 x i32> %val to <32 x float>
+  store <32 x float> %cvt, <32 x float> addrspace(1)* %out
+  ret void
+}
+
+; COMMON: 'sitofp_i64_to_f32'
+; COMMON: estimated cost of 1 for {{.*}} sitofp i64 %val to float
+define void @sitofp_i64_to_f32(float addrspace(1)* %out, i64 %val) #0 {
+  %cvt = sitofp i64 %val to float
+  store float %cvt, float addrspace(1)* %out
+  ret void
+}
+
+; COMMON: 'sitofp_i32_to_f64'
+; COMMON: estimated cost of 2 for {{.*}} sitofp i32 %val to double
+define void @sitofp_i32_to_f64(double addrspace(1)* %out, i32 %val) #0 {
+  %cvt = sitofp i32 %val to double
+  store double %cvt, double addrspace(1)* %out
+  ret void
+}
+
+; COMMON: 'sitofp_i64_to_f64'
+; COMMON: estimated cost of 8 for {{.*}} sitofp i64 %val to double
+define void @sitofp_i64_to_f64(double addrspace(1)* %out, i64 %val) #0 {
+  %cvt = sitofp i64 %val to double
+  store double %cvt, double addrspace(1)* %out
+  ret void
+}
+
+; COMMON: 'sitofp_v3i64_to_v3f64'
+; COMMON: estimated cost of 24 for {{.*}} sitofp <3 x i64> %val to <3 x double>
+define void @sitofp_v3i64_to_v3f64(<3 x double> addrspace(1)* %out, <3 x i64> %val) #0 {
+  %cvt = sitofp <3 x i64> %val to <3 x double>
+  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }