Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -515,8 +515,13 @@ // Can't properly model a cost of a call. // FIXME: With a proper cost model we should be able to do it. - if(isa(&I)) - return None; + if (auto *CI = dyn_cast(&I)) { + const Function *Callee = CI->getCalledFunction(); + if (!Callee || TTI.isLoweredToCall(Callee)) { + LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n"); + return None; + } + } // If the instruction might have a side-effect recursively account for // the cost of it and all the instructions leading up to it. Index: test/Transforms/LoopUnroll/AMDGPU/unroll-cost-call.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/AMDGPU/unroll-cost-call.ll @@ -0,0 +1,78 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -loop-unroll -unroll-threshold=100 -unroll-peel-count=0 -unroll-allow-partial=false -unroll-max-iteration-count-to-analyze=16 < %s | FileCheck %s + +; CHECK-LABEL: @test_intrinsic_call_cost( +; CHECK-NOT: br i1 +define amdgpu_kernel void @test_intrinsic_call_cost(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture %in) #0 { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.02 = phi float [ %fmul, %for.body ], [ 0.0, %entry ] + %arrayidx.in = getelementptr inbounds float, float addrspace(1)* %in, i32 %indvars.iv + %arrayidx.out = getelementptr inbounds float, float addrspace(1)* %out, i32 %indvars.iv + %load = load float, float addrspace(1)* %arrayidx.in + %call = call float @llvm.minnum.f32(float %load, float 1.0); + %fmul = fmul float %call, %sum.02 + store float %fmul, float addrspace(1)* %arrayidx.out + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 16 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +; CHECK-LABEL: @test_func_call_cost( +; CHECK: br i1 %exitcond +define amdgpu_kernel void @test_func_call_cost(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture %in) #0 { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.02 = phi float [ %fmul, %for.body ], [ 0.0, %entry ] + %arrayidx.in = getelementptr inbounds float, float addrspace(1)* %in, i32 %indvars.iv + %arrayidx.out = getelementptr inbounds float, float addrspace(1)* %out, i32 %indvars.iv + %load = load float, float addrspace(1)* %arrayidx.in + %fptr = load float(float, float)*, float(float, float )* addrspace(4)* null + %call = tail call float %fptr(float %load, float 1.0) + %fmul = fmul float %call, %sum.02 + store float %fmul, float addrspace(1)* %arrayidx.out + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 16 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + + +; CHECK-LABEL: @test_indirect_call_cost( +; CHECK: br i1 %exitcond +define amdgpu_kernel void @test_indirect_call_cost(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture %in) #0 { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.02 = phi float [ %fmul, %for.body ], [ 0.0, %entry ] + %arrayidx.in = getelementptr inbounds float, float addrspace(1)* %in, i32 %indvars.iv + %arrayidx.out = getelementptr inbounds float, float addrspace(1)* %out, i32 %indvars.iv + %load = load float, float addrspace(1)* %arrayidx.in + %min = call float @func(float %load, float 1.0); + %fmul = fmul float %min, %sum.02 + store float %fmul, float addrspace(1)* %arrayidx.out + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 16 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +declare float @llvm.minnum.f32(float, float) #1 +declare float @func(float, float) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone speculatable }