Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -422,6 +422,8 @@
     bool AllowPeeling;
     /// Allow unrolling of all the iterations of the runtime loop remainder.
     bool UnrollRemainder;
+    /// Allow unrolling a convergent loop with a remainder.
+    bool AllowRemainderForConvergentLoop;
   };
 
   /// \brief Get target-customized preferences for the generic loop unrolling
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -93,6 +93,7 @@
   UP.Threshold = 300; // Twice the default.
   UP.MaxCount = std::numeric_limits<unsigned>::max();
   UP.Partial = true;
+  UP.AllowRemainderForConvergentLoop = true;
 
   // TODO: Do we want runtime unrolling?
Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -191,6 +191,7 @@
   UP.Force = false;
   UP.UpperBound = false;
   UP.AllowPeeling = true;
+  UP.AllowRemainderForConvergentLoop = false;
 
   // Override with any target specific settings
   TTI.getUnrollingPreferences(L, SE, UP);
@@ -1015,7 +1016,7 @@
   // Assuming n is the same on all threads, any kind of unrolling is
   // safe. But currently llvm's notion of convergence isn't powerful
   // enough to express this.
-  if (Convergent)
+  if (Convergent && !UP.AllowRemainderForConvergentLoop)
     UP.AllowRemainder = false;
 
   // Try to find the trip count upper bound if we cannot find the exact trip
Index: test/Transforms/LoopUnroll/AMDGPU/convergent.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopUnroll/AMDGPU/convergent.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -loop-unroll -S | FileCheck %s
+
+target triple = "amdgcn"
+
+declare void @f() convergent
+
+; This loop contains a convergent instruction. Since the AMDGPU unrolling
+; preferences allow unrolling a loop that contains a convergent instruction
+; with a remainder, the loop is unrolled to its pragma unroll count of 2
+; instead of being fully unrolled.
+
+define void @pragma_unroll() {
+entry:
+  br label %l3, !llvm.loop !0
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK-NOT: call void @f()
+  call void @f() convergent
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 4
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !0
+
+exit:
+  ret void
+
+}
+
+!0 = !{!0, !{!"llvm.loop.unroll.count", i32 2}}
Index: test/Transforms/LoopUnroll/convergent.ll
===================================================================
--- test/Transforms/LoopUnroll/convergent.ll
+++ test/Transforms/LoopUnroll/convergent.ll
@@ -80,4 +80,29 @@
   ret i32 0
 }
 
+; This loop contains a convergent instruction, so unrolling with a remainder
+; is disabled. This overrides its unroll pragma -- we unroll 4 times,
+; even though 2 is requested.
+; CHECK-LABEL: @pragma_unroll2
+define void @pragma_unroll2() {
+entry:
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+  call void @f() convergent
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 4
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  ret void
+
+}
+
 !0 = !{!0, !{!"llvm.loop.unroll.count", i32 16}}
+!1 = !{!1, !{!"llvm.loop.unroll.count", i32 2}}
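
Note (not part of the patch): any target, not only AMDGPU, can opt in to the new flag from its getUnrollingPreferences override. The sketch below assumes a hypothetical MyTargetTTIImpl class; only the AllowRemainderForConvergentLoop field and the hook signature come from this patch, everything else is illustrative.

// Hypothetical example -- "MyTargetTTIImpl" is a placeholder and is not part
// of this patch. The class declaration and any other unrolling preferences
// the target would normally set are omitted.
void MyTargetTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                              TTI::UnrollingPreferences &UP) {
  // Keep partial unrolling enabled, as the AMDGPU hunk above does.
  UP.Partial = true;
  // Opt in to unrolling convergent loops with a remainder. LoopUnrollPass
  // defaults this to false, so targets must set it explicitly.
  UP.AllowRemainderForConvergentLoop = true;
}

This is only safe if the target's convergent operations tolerate the extra remainder copy of the loop body, which is the same assumption the AMDGPU change in this patch makes.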