Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -28,18 +28,6 @@ #define DEBUG_TYPE "AMDGPUtti" -static const int FullRateCost = TargetTransformInfo::TCC_Basic; -static const int HalfRateCost = 2 * TargetTransformInfo::TCC_Basic; - -// TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe should -// be 2 or 4. -static const int QuarterRateCost = 3 * TargetTransformInfo::TCC_Basic; - -// TODO: On some parts, normal fp64 operations are half rate, and others -// quarter. -static const int FP64RateCost = HalfRateCost; - - void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) { UP.Threshold = 300; // Twice the default. @@ -102,31 +90,30 @@ case Intrinsic::fma: { if (VT == MVT::f32 || VT == MVT::f16) { if (ST.hasFastFMAF32()) - return FullRateCost; + return TargetTransformInfo::TCC_Basic; - return FP64RateCost; + return 3 * TargetTransformInfo::TCC_Basic; } - return QuarterRateCost; + return 3 * TargetTransformInfo::TCC_Basic; } case Intrinsic::floor: { if (VT == MVT::f32 || VT == MVT::f16) - return FullRateCost; + return TargetTransformInfo::TCC_Basic; if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) - return FP64RateCost; + return 2 * TargetTransformInfo::TCC_Basic; - return getIntrinsicCost(ST, VT, Intrinsic::trunc) + - 3 * FullRateCost + 3 * HalfRateCost; + return getIntrinsicCost(ST, VT, Intrinsic::trunc) + 7; } case Intrinsic::trunc: { if (VT == MVT::f32 || VT == MVT::f16) - return FullRateCost; + return TargetTransformInfo::TCC_Basic; if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) - return FP64RateCost; + return 2 * TargetTransformInfo::TCC_Basic; - return 16 * FullRateCost + 1 * HalfRateCost; + return 15; } default: return -1; @@ -160,10 +147,10 @@ case ISD::FSUB: case ISD::FMUL: if (SLT == MVT::f64) - return LT.first * 
NElts * FP64RateCost; + return 2 * LT.first * NElts; if (SLT == MVT::f32 || SLT == MVT::f16) - return LT.first * NElts * FullRateCost; + return LT.first * NElts; break; case ISD::FDIV: @@ -171,9 +158,10 @@ // FIXME: frem should be handled separately. The fdiv in it is most of it, // but the current lowering is also not entirely correct. if (SLT == MVT::f64) { - int Cost = 4 * FP64RateCost + 7 * QuarterRateCost; + int Cost = 24; + if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) - Cost += 3 * FullRateCost; + Cost += 3; return LT.first * Cost * NElts; } @@ -181,8 +169,7 @@ // Assuming no fp32 denormals lowering if (SLT == MVT::f32 || SLT == MVT::f16) { assert(!ST->hasFP32Denormals() && "will change when supported"); - int Cost = 7 * FullRateCost + 1 * QuarterRateCost; - return LT.first * NElts * Cost; + return 6 * LT.first * NElts; } break; @@ -229,7 +216,7 @@ return BaseT::getCastInstrCost(Opcode, Dst, Src); } else { // f32 -> i32 full rate instruction. - Cost += FullRateCost; + Cost += 1; } } else { assert(SSrcLT == MVT::f64); @@ -239,12 +226,10 @@ Cost += ::getIntrinsicCost(*ST, SSrcLT, Intrinsic::trunc); Cost += ::getIntrinsicCost(*ST, SSrcLT, Intrinsic::floor); Cost += ::getIntrinsicCost(*ST, SSrcLT, Intrinsic::fma); - Cost += 4 * FullRateCost; - Cost += 1 * QuarterRateCost; - Cost += 3 * FP64RateCost; + Cost += 6; } else { // f64 -> i32 half or quarter rate instruction. - Cost += FP64RateCost; + Cost += 2; } } @@ -260,15 +245,19 @@ return BaseT::getCastInstrCost(Opcode, Dst, Src); } else { // i32 -> f32 full rate instruction. - Cost = FullRateCost; + Cost = TargetTransformInfo::TCC_Basic; } } else { // i64 to f64 expansion if (SSrcLT == MVT::i64) { - Cost = 4 * FP64RateCost; + // [su]int_to_fp (half or full) + // uint_to_fp (half or full) + // ldexp (half or full) + // fadd (half or full) + Cost = 2 + 2 + 2 + 2; } else { // i32 -> f64 half or quarter rate instruction. 
- Cost = FP64RateCost; + Cost = 2; } } Index: test/Analysis/CostModel/AMDGPU/fdiv.ll =================================================================== --- test/Analysis/CostModel/AMDGPU/fdiv.ll +++ test/Analysis/CostModel/AMDGPU/fdiv.ll @@ -2,7 +2,7 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefix=COMMON -check-prefix=SI %s ; CHECK: 'fdiv_f32' -; COMMON: estimated cost of 10 for {{.*}} fdiv float +; COMMON: estimated cost of 6 for {{.*}} fdiv float define void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float %vec, %b @@ -11,7 +11,7 @@ } ; COMMON: 'fdiv_v2f32' -; COMMON: estimated cost of 20 for {{.*}} fdiv <2 x float> +; COMMON: estimated cost of 12 for {{.*}} fdiv <2 x float> define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fdiv <2 x float> %vec, %b @@ -20,7 +20,7 @@ } ; COMMON: 'fdiv_v3f32' -; COMMON: estimated cost of 30 for {{.*}} fdiv <3 x float> +; COMMON: estimated cost of 18 for {{.*}} fdiv <3 x float> define void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fdiv <3 x float> %vec, %b @@ -29,8 +29,8 @@ } ; COMMON: 'fdiv_f64' -; CI: estimated cost of 29 for {{.*}} fdiv double -; SI: estimated cost of 32 for {{.*}} fdiv double +; CI: estimated cost of 24 for {{.*}} fdiv double +; SI: estimated cost of 27 for {{.*}} fdiv double define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fdiv double %vec, %b @@ -39,8 +39,8 @@ } ; COMMON: 'fdiv_v2f64' -; CI: estimated cost of 58 for {{.*}} fdiv <2 x double> -; SI: estimated cost of 64 for {{.*}} fdiv <2 x double> +; CI: estimated 
cost of 48 for {{.*}} fdiv <2 x double> +; SI: estimated cost of 54 for {{.*}} fdiv <2 x double> define void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %add = fdiv <2 x double> %vec, %b @@ -49,8 +49,8 @@ } ; COMMON: 'fdiv_v3f64' -; CI: estimated cost of 87 for {{.*}} fdiv <3 x double> -; SI: estimated cost of 96 for {{.*}} fdiv <3 x double> +; CI: estimated cost of 72 for {{.*}} fdiv <3 x double> +; SI: estimated cost of 81 for {{.*}} fdiv <3 x double> define void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %add = fdiv <3 x double> %vec, %b @@ -59,7 +59,7 @@ } ; COMMON: 'fdiv_f16' -; COMMON: estimated cost of 10 for {{.*}} fdiv half +; COMMON: estimated cost of 6 for {{.*}} fdiv half define void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { %vec = load half, half addrspace(1)* %vaddr %add = fdiv half %vec, %b @@ -68,7 +68,7 @@ } ; COMMON: 'fdiv_v2f16' -; COMMON: estimated cost of 20 for {{.*}} fdiv <2 x half> +; COMMON: estimated cost of 12 for {{.*}} fdiv <2 x half> define void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %add = fdiv <2 x half> %vec, %b @@ -77,7 +77,7 @@ } ; COMMON: 'fdiv_v4f16' -; COMMON: estimated cost of 40 for {{.*}} fdiv <4 x half> +; COMMON: estimated cost of 24 for {{.*}} fdiv <4 x half> define void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr %add = fdiv <4 x half> %vec, %b Index: test/Analysis/CostModel/AMDGPU/fptosi.ll =================================================================== --- test/Analysis/CostModel/AMDGPU/fptosi.ll +++ 
test/Analysis/CostModel/AMDGPU/fptosi.ll @@ -35,8 +35,8 @@ } ; COMMON: 'fptosi_f64_to_i64' -; SI: estimated cost of 61 for {{.*}} fptosi double %val to i64 -; CI: estimated cost of 20 for {{.*}} fptosi double %val to i64 +; SI: estimated cost of 46 for {{.*}} fptosi double %val to i64 +; CI: estimated cost of 13 for {{.*}} fptosi double %val to i64 define void @fptosi_f64_to_i64(i64 addrspace(1)* %out, double %val) #0 { %cvt = fptosi double %val to i64 store i64 %cvt, i64 addrspace(1)* %out @@ -44,8 +44,8 @@ } ; COMMON: 'fptosi_v3f64_to_v3i64' -; SI: estimated cost of 183 for {{.*}} fptosi <3 x double> %val to <3 x i64> -; CI: estimated cost of 60 for {{.*}} fptosi <3 x double> %val to <3 x i64> +; SI: estimated cost of 138 for {{.*}} fptosi <3 x double> %val to <3 x i64> +; CI: estimated cost of 39 for {{.*}} fptosi <3 x double> %val to <3 x i64> define void @fptosi_v3f64_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x double> %val) #0 { %cvt = fptosi <3 x double> %val to <3 x i64> store <3 x i64> %cvt, <3 x i64> addrspace(1)* %out