Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -785,16 +785,27 @@ if (getUserCost(I, Operands) == TTI::TCC_Free) return 0; - if (isa<CallInst>(I)) - return 40; - if (isa<LoadInst>(I)) return 4; - Type *dstTy = I->getType(); - if (VectorType *VectorTy = dyn_cast<VectorType>(dstTy)) - dstTy = VectorTy->getElementType(); - if (dstTy->isFloatingPointTy()) + Type *DstTy = I->getType(); + + // Usually an intrinsic is a simple instruction. + // A real function call is much slower. + if (auto *CI = dyn_cast<CallInst>(I)) { + const Function *F = CI->getCalledFunction(); + if (static_cast<T *>(this)->isLoweredToCall(F)) + return 40; + // Some intrinsics return a value and a flag, we use the value type + // to decide its latency. + if (StructType* StructTy = dyn_cast<StructType>(DstTy)) + DstTy = StructTy->getElementType(0); + // Fall through to simple instructions. 
+ } + + if (VectorType *VectorTy = dyn_cast<VectorType>(DstTy)) + DstTy = VectorTy->getElementType(); + if (DstTy->isFloatingPointTy()) return 3; return 1; Index: llvm/trunk/test/Analysis/CostModel/X86/costmodel.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/costmodel.ll +++ llvm/trunk/test/Analysis/CostModel/X86/costmodel.ll @@ -5,6 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) + define i64 @foo(i64 %arg) { ; LATENCY: cost of 0 {{.*}} alloca i32 @@ -39,6 +41,10 @@ ; CODESIZE: cost of 0 {{.*}} trunc %TC = trunc i64 undef to i32 + ; LATENCY: cost of 1 {{.*}} call + ; CODESIZE: cost of 1 {{.*}} call + %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef) + ; LATENCY: cost of 1 {{.*}} ret ; CODESIZE: cost of 1 {{.*}} ret ret i64 undef