diff --git a/llvm/docs/ConvergenceAndUniformity.rst b/llvm/docs/ConvergenceAndUniformity.rst
--- a/llvm/docs/ConvergenceAndUniformity.rst
+++ b/llvm/docs/ConvergenceAndUniformity.rst
@@ -393,15 +393,19 @@
 
 1. The semantics of the instruction may specify the output to be
    uniform.
-2. Otherwise, if it is a PHI node, its output is uniform if and only
-   if for every pair of converged dynamic instances produced by all
-   threads in ``S``:
+2. Otherwise, the output is divergent if the static instance is not
+   :ref:`m-converged <convergence-m-converged>`.
+3. Otherwise (the static instance is m-converged):
 
-   a. Both instances choose the same output from converged
-      dynamic instances, and,
-   b. That output is uniform for all threads in ``S``.
-3. Otherwise, the output is uniform if and only if the input
-   operands are uniform for all threads in ``S``.
+   1. If it is a PHI node, its output is uniform if and only
+      if for every pair of converged dynamic instances produced by all
+      threads in ``S``:
+
+      a. Both instances choose the same output from converged
+         dynamic instances, and,
+      b. That output is uniform for all threads in ``S``.
+   2. Otherwise, the output is uniform if and only if the input
+      operands are uniform for all threads in ``S``.
 
 Divergent Cycle Exits
 ---------------------
@@ -433,6 +437,8 @@
 those static instances whose convergence is independent of the cycle
 hierarchy:
 
+.. _convergence-m-converged:
+
   **m-converged static instances:**
 
   A static instance ``X`` is *m-converged* for a given CFG if and only
@@ -474,9 +480,8 @@
    if the whole CFG is reducible, then all nodes in the CFG are
    m-converged.
 
-If a static instance is not m-converged, then every output is assumed
-to be divergent. Otherwise, for an m-converged static instance, the
-uniformity of each output is determined using the criteria
+The uniformity of each output of a static instance
+is determined using the criteria
 :ref:`described earlier <convergence-uniformity>`. The discovery of
 divergent outputs may cause their uses (including branches) to also
 become divergent. The analysis propagates this divergence until a
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -784,6 +784,9 @@
     return false;
   }
 
+  if (isAlwaysUniform(I))
+    return false;
+
   return markDefsDivergent(I);
 }
 
@@ -952,10 +955,6 @@
     if (I.isTerminator())
       break;
 
-    // Mark this as divergent. We don't check if the instruction is
-    // always uniform. In a cycle where the thread convergence is not
-    // statically known, the instruction is not statically converged,
-    // and its outputs cannot be statically uniform.
     if (markDivergent(I))
       Worklist.push_back(&I);
   }
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -48,13 +48,10 @@
 void llvm::GenericUniformityAnalysisImpl<SSAContext>::pushUsers(
     const Value *V) {
   for (const auto *User : V->users()) {
-    const auto *UserInstr = dyn_cast<const Instruction>(User);
-    if (!UserInstr)
-      continue;
-    if (isAlwaysUniform(*UserInstr))
-      continue;
-    if (markDivergent(*UserInstr)) {
-      Worklist.push_back(UserInstr);
+    if (const auto *UserInstr = dyn_cast<const Instruction>(User)) {
+      if (markDivergent(*UserInstr)) {
+        Worklist.push_back(UserInstr);
+      }
     }
   }
 }
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -75,8 +75,6 @@
     Register Reg) {
   const auto &RegInfo = F.getRegInfo();
   for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
-    if (isAlwaysUniform(UserInstr))
-      continue;
     if (markDivergent(UserInstr))
       Worklist.push_back(&UserInstr);
   }
diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll
@@ -569,6 +569,37 @@
   ret void
 }
 
+define amdgpu_kernel void @always_uniform() {
+; CHECK-LABEL: UniformityInfo for function 'always_uniform':
+; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK:   depth=1: entries(bb2 bb3)
+; The divergent branch in %bb enters the bb2 <-> bb3 cycle at both of its blocks.
+bb:
+  %inst = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
+  %inst1 = icmp ugt i32 %inst, 0
+  br i1 %inst1, label %bb3, label %bb2
+; CHECK:   DIVERGENT:   %inst = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
+; CHECK:   DIVERGENT:   %inst1 = icmp ugt i32 %inst, 0
+; CHECK:   DIVERGENT:   br i1 %inst1, label %bb3, label %bb2
+
+bb2:                                              ; preds = %bb3, %bb
+  br label %bb3
+; In %bb3 the icmp intrinsic call must not be reported divergent, while the values derived from it must be.
+bb3:                                              ; preds = %bb2, %bb
+  %inst4 = tail call i64 @llvm.amdgcn.icmp.i64.i16(i16 0, i16 0, i32 0)
+  %inst5 = trunc i64 %inst4 to i32
+  %inst6 = and i32 0, %inst5
+  br label %bb2
+; CHECK-LABEL: BLOCK bb3
+; CHECK-NOT: DIVERGENT: {{.*}} call i64 @llvm.amdgcn.icmp.i64.i16
+; CHECK:   DIVERGENT:   %inst5 = trunc i64 %inst4 to i32
+; CHECK:   DIVERGENT:   %inst6 = and i32 0, %inst5
+}
+
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
+
+declare i64 @llvm.amdgcn.icmp.i64.i16(i16, i16, i32 immarg)
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 attributes #0 = { nounwind readnone }