Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -984,8 +984,8 @@
             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
       }
     }
-    // End of for loop that looks at all source operands to decide vm_wait_cnt
-    // and lgk_wait_cnt.
+    // End of for loop that looks at all source operands to decide vm_cnt
+    // and lgkm_cnt.
 
     // Two cases are handled for destination operands:
     // 1) If the destination operand was defined by a load, add the s_waitcnt
@@ -1263,12 +1263,23 @@
   // they will need to retain and not clear their initial state.
 
   // See if there are any uninitialized predecessors. If so, emit an
-  // s_waitcnt 0 at the beginning of the block.
-  for (MachineBasicBlock *pred : Block.predecessors()) {
+  // s_waitcnt 0 at the beginning of the block if we're sure
+  // that we will *not* revisit the block.
+  for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[pred].get();
-    bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
-    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
+        BlockWaitcntBracketsMap[Pred].get();
+    bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+    if (!Visited && !ScoreBrackets->getRevisitLoop()){
+      // Pred not visited, so there had better be no PredScoreBrackets.
+      // Unfortunately, emit an s_waitcnt 0 since we don't have
+      // Pred info and we're not going to get it.
+      // FIXME: Can we revisit this block after we've visited
+      // the Pred?
+      assert(!PredScoreBrackets);
+      ScoreBrackets->setWaitAtBeginning();
+      break;
+    }
+    if (!PredScoreBrackets || PredScoreBrackets->getWaitAtBeginning()) {
       break;
     }
     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@@ -1305,9 +1316,19 @@
   // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
+      BlockWaitcntBracketsMap[Pred].get();
     bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
-    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
+    if (!Visited && !ScoreBrackets->getRevisitLoop()){
+      // Pred not visited, so there had better be no PredScoreBrackets.
+      // Unfortunately, emit an s_waitcnt 0 since we don't have
+      // Pred info and we're not going to get it.
+      // FIXME: Can we revisit this block after we've visited
+      // the Pred?
+      assert(!PredScoreBrackets);
+      ScoreBrackets->setWaitAtBeginning();
+      break;
+    }
+    if (!PredScoreBrackets || PredScoreBrackets->getWaitAtBeginning()) {
       break;
     }
 
Index: test/CodeGen/AMDGPU/waitcnt-no-preds.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/waitcnt-no-preds.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Check that the waitcnt pass inserts an S_WAITCNT 0 at the top of a
+; block when its preds will never be visited prior to the block.
+
+; CHECK: {{BB[0-9]_3:}}
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK: {{BB[0-9]_4:}}
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+
+define amdgpu_ps void @main(i32 inreg %arg, <2 x float> %arg1) local_unnamed_addr {  ; entry block bb, then the loop bb13 -> (bb16) -> bb19 -> bb22 -> bb13
+bb:  ; entry block
+  %tmp = insertelement <2 x i32> <i32 undef, i32 1>, i32 %arg, i32 0  ; vector <%arg, 1>
+  %tmp2 = bitcast <2 x i32> %tmp to i64  ; reinterpret the two i32 lanes as one i64
+  %tmp3 = inttoptr i64 %tmp2 to [4294967295 x i8] addrspace(2)*
+  %tmp4 = extractelement <2 x float> %arg1, i32 1
+  %tmp5 = insertelement <4 x float> undef, float %tmp4, i32 0  ; initial value of the %tmp14 phi
+  %tmp6 = inttoptr i64 %tmp2 to <4 x i32> addrspace(2)*
+  %tmp7 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp6, align 16  ; load through the addrspace(2) pointer
+  %tmp8 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp7, i32 0, i32 0, i1 false, i1 false)  ; buffer load with %tmp7 as its first operand
+  %tmp9 = bitcast float %tmp8 to i32
+  %tmp10 = icmp eq i32 %tmp9, 0  ; branch condition used in bb13; computed once, before the loop
+  %tmp11 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(2)* %tmp3, i64 0, i64 64
+  %tmp12 = bitcast i8 addrspace(2)* %tmp11 to <4 x i32> addrspace(2)*  ; %tmp12 has no uses in this function
+  br label %bb13  ; enter the loop header
+
+bb13:                                             ; preds = %bb22, %bb
+  %tmp14 = phi <4 x float> [ %tmp5, %bb ], [ %tmp23, %bb22 ]  ; vector carried around the loop (shuffled in bb22)
+  %tmp15 = phi i32 [ 0, %bb ], [ %tmp24, %bb22 ]  ; loop counter, starts at 0
+  br i1 %tmp10, label %bb19, label %bb16  ; skip the load in bb16 when %tmp10 is true
+
+bb16:                                             ; preds = %bb13
+  %tmp17 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp7, i32 0, i32 0, i1 false, i1 false)  ; second buffer load, inside the loop
+  %tmp18 = bitcast float %tmp17 to i32
+  br label %bb19
+
+bb19:                                             ; preds = %bb16, %bb13
+  %tmp20 = phi i32 [ %tmp18, %bb16 ], [ 0, %bb13 ]  ; value loaded in bb16, or 0 when bb16 was skipped
+  %tmp21 = icmp slt i32 %tmp15, %tmp20  ; counter < %tmp20 (signed)
+  br i1 %tmp21, label %bb22, label %bb25  ; continue to bb22, or leave the loop via bb25
+
+bb22:                                             ; preds = %bb19
+  %tmp23 = shufflevector <4 x float> %tmp14, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>  ; select lanes <1, 2, 3, 0> of %tmp14
+  %tmp24 = add i32 %tmp15, 1  ; increment the counter
+  br label %bb13  ; back edge to the loop header
+
+bb25:                                             ; preds = %bb19
+  ret void
+}
+
+
+
+; Function Attrs: nounwind readonly
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1)