Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -422,6 +422,8 @@
     bool AllowPeeling;
     /// Allow unrolling of all the iterations of the runtime loop remainder.
     bool UnrollRemainder;
+    /// Allow unrolling a convergent loop with a remainder.
+    bool AllowRemainderForConvergentLoop;
   };
 
   /// \brief Get target-customized preferences for the generic loop unrolling
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -93,6 +93,7 @@
   UP.Threshold = 300; // Twice the default.
   UP.MaxCount = std::numeric_limits<unsigned>::max();
   UP.Partial = true;
+  UP.AllowRemainderForConvergentLoop = true;
 
   // TODO: Do we want runtime unrolling?
Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -191,6 +191,7 @@
   UP.Force = false;
   UP.UpperBound = false;
   UP.AllowPeeling = true;
+  UP.AllowRemainderForConvergentLoop = false;
 
   // Override with any target specific settings
   TTI.getUnrollingPreferences(L, SE, UP);
@@ -1015,7 +1016,7 @@
   // Assuming n is the same on all threads, any kind of unrolling is
   // safe. But currently llvm's notion of convergence isn't powerful
   // enough to express this.
-  if (Convergent)
+  if (Convergent && !UP.AllowRemainderForConvergentLoop)
     UP.AllowRemainder = false;
 
   // Try to find the trip count upper bound if we cannot find the exact trip
Index: test/Transforms/LoopUnroll/AMDGPU/convergent.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopUnroll/AMDGPU/convergent.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -loop-unroll -S | FileCheck %s
+
+target triple = "amdgcn"
+
+declare void @f() convergent
+
+; This loop contains a convergent instruction. Since the AMDGPU unrolling
+; preferences allow unrolling a loop that contains a convergent instruction
+; with a remainder, the loop is unrolled to its pragma unroll count of 2
+; instead of being fully unrolled.
+
+define void @pragma_unroll() {
+entry:
+  br label %l3, !llvm.loop !0
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK-NOT: call void @f()
+  call void @f() convergent
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 4
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !0
+
+exit:
+  ret void
+
+}
+
+!0 = !{!0, !{!"llvm.loop.unroll.count", i32 2}}
Index: test/Transforms/LoopUnroll/convergent.ll
===================================================================
--- test/Transforms/LoopUnroll/convergent.ll
+++ test/Transforms/LoopUnroll/convergent.ll
@@ -80,4 +80,29 @@
   ret i32 0
 }
 
+; This loop contains a convergent instruction, so unrolling with a remainder
+; is disabled. This overrides its unroll pragma -- we unroll 4 times,
+; even though 2 is requested.
+; CHECK-LABEL: @pragma_unroll2
+define void @pragma_unroll2() {
+entry:
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+  call void @f() convergent
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 4
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  ret void
+
+}
+
 !0 = !{!0, !{!"llvm.loop.unroll.count", i32 16}}
+!1 = !{!1, !{!"llvm.loop.unroll.count", i32 2}}
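
Note (not part of the patch): any target, not only AMDGPU, can opt in to the new flag from its getUnrollingPreferences override. The sketch below assumes a hypothetical MyTargetTTIImpl class; only the AllowRemainderForConvergentLoop field and the hook signature come from this patch, everything else is illustrative.

// Hypothetical example -- "MyTargetTTIImpl" is a placeholder and is not part
// of this patch. The class declaration and any other unrolling preferences
// the target would normally set are omitted.
void MyTargetTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                              TTI::UnrollingPreferences &UP) {
  // Keep partial unrolling enabled, as the AMDGPU hunk above does.
  UP.Partial = true;
  // Opt in to unrolling convergent loops with a remainder. LoopUnrollPass
  // defaults this to false, so targets must set it explicitly.
  UP.AllowRemainderForConvergentLoop = true;
}

This is only safe if the target's convergent operations tolerate the extra remainder copy of the loop body, which is the same assumption the AMDGPU change in this patch makes.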