diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -784,6 +784,9 @@
     return false;
   }
 
+  if (isAlwaysUniform(I))
+    return false;
+
   return markDefsDivergent(I);
 }
 
@@ -952,10 +955,6 @@
     if (I.isTerminator())
       break;
 
-    // Mark this as divergent. We don't check if the instruction is
-    // always uniform. In a cycle where the thread convergence is not
-    // statically known, the instruction is not statically converged,
-    // and its outputs cannot be statically uniform.
    if (markDivergent(I))
       Worklist.push_back(&I);
   }
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -48,13 +48,10 @@
 void llvm::GenericUniformityAnalysisImpl<SSAContext>::pushUsers(
     const Value *V) {
   for (const auto *User : V->users()) {
-    const auto *UserInstr = dyn_cast<Instruction>(User);
-    if (!UserInstr)
-      continue;
-    if (isAlwaysUniform(*UserInstr))
-      continue;
-    if (markDivergent(*UserInstr)) {
-      Worklist.push_back(UserInstr);
+    if (const auto *UserInstr = dyn_cast<Instruction>(User)) {
+      if (markDivergent(*UserInstr)) {
+        Worklist.push_back(UserInstr);
+      }
     }
   }
 }
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -75,8 +75,6 @@
     Register Reg) {
   const auto &RegInfo = F.getRegInfo();
   for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
-    if (isAlwaysUniform(UserInstr))
-      continue;
     if (markDivergent(UserInstr))
       Worklist.push_back(&UserInstr);
   }
diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll
@@ -569,6 +569,37 @@
   ret void
 }
 
+define amdgpu_kernel void @always_uniform() {
+; CHECK-LABEL: UniformityInfo for function 'always_uniform':
+; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK:   depth=1: entries(bb2 bb3)
+
+bb:
+  %inst = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
+  %inst1 = icmp ugt i32 %inst, 0
+  br i1 %inst1, label %bb3, label %bb2
+; CHECK: DIVERGENT: %inst = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
+; CHECK: DIVERGENT: %inst1 = icmp ugt i32 %inst, 0
+; CHECK: DIVERGENT: br i1 %inst1, label %bb3, label %bb2
+
+bb2:                                              ; preds = %bb3, %bb
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb
+  %inst4 = tail call i64 @llvm.amdgcn.icmp.i64.i16(i16 0, i16 0, i32 0)
+  %inst5 = trunc i64 %inst4 to i32
+  %inst6 = and i32 0, %inst5
+  br label %bb2
+; CHECK-LABEL: BLOCK bb3
+; CHECK-NOT: DIVERGENT: {{.*}} call i64 @llvm.amdgcn.icmp.i64.i16
+; CHECK: DIVERGENT: %inst5 = trunc i64 %inst4 to i32
+; CHECK: DIVERGENT: %inst6 = and i32 0, %inst5
+}
+
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
+
+declare i64 @llvm.amdgcn.icmp.i64.i16(i16, i16, i32 immarg)
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 attributes #0 = { nounwind readnone }