Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -984,8 +984,8 @@
             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
       }
     }
-    // End of for loop that looks at all source operands to decide vm_wait_cnt
-    // and lgk_wait_cnt.
+    // End of the for loop that looks at all source operands to decide
+    // vm_cnt and lgkm_cnt.
 
     // Two cases are handled for destination operands:
     // 1) If the destination operand was defined by a load, add the s_waitcnt
@@ -1263,12 +1263,23 @@
   // they will need to retain and not clear their initial state.
 
   // See if there are any uninitialized predecessors. If so, emit an
-  // s_waitcnt 0 at the beginning of the block.
-  for (MachineBasicBlock *pred : Block.predecessors()) {
+  // s_waitcnt 0 at the beginning of the block if we're sure
+  // that we will *not* revisit the block.
+  for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[pred].get();
-    bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
-    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
+        BlockWaitcntBracketsMap[Pred].get();
+    bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+    if (!Visited && !ScoreBrackets->getRevisitLoop()) {
+      // Pred has not been visited, so it must not have score brackets yet.
+      // We have no pred info and will not get any because this block is
+      // not going to be revisited, so conservatively emit an s_waitcnt 0.
+      // FIXME: can we revisit this block after we've visited
+      // the pred?
+      assert(!PredScoreBrackets);
+      ScoreBrackets->setWaitAtBeginning();
+      break;
+    }
+    if (!PredScoreBrackets || PredScoreBrackets->getWaitAtBeginning()) {
       break;
     }
     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@@ -1305,9 +1316,19 @@
   // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
         BlockWaitcntBracketsMap[Pred].get();
     bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
-    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
+    if (!Visited && !ScoreBrackets->getRevisitLoop()) {
+      // Pred has not been visited, so it must not have score brackets yet.
+      // We have no pred info and will not get any because this block is
+      // not going to be revisited, so conservatively emit an s_waitcnt 0.
+      // FIXME: can we revisit this block after we've visited
+      // the pred?
+      assert(!PredScoreBrackets);
+      ScoreBrackets->setWaitAtBeginning();
+      break;
+    }
+    if (!PredScoreBrackets || PredScoreBrackets->getWaitAtBeginning()) {
       break;
     }
 
Index: test/CodeGen/AMDGPU/waitcnt-no-preds.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/waitcnt-no-preds.ll
@@ -0,0 +1,106 @@
+; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Check that the waitcnt pass inserts an s_waitcnt 0 at the top of a
+; block when its predecessors will never be visited before the block.
+
+; CHECK-LABEL: BB0_3:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-LABEL: BB0_4:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+
+; Function Attrs: nounwind
+define amdgpu_ps void @main(i32 inreg, i32 inreg, i32 inreg, i32 inreg, <2 x float>, <2 x float>, <2 x float>, <3 x float>, <2 x float>, <2 x float>, <2 x float>, float, float, float, float, float, i32, i32, i32, i32) local_unnamed_addr #0 !spirv.ExecutionModel !6 {
+  %21 = insertelement <2 x i32> <i32 undef, i32 1>, i32 %2, i32 0
+  %22 = bitcast <2 x i32> %21 to i64
+  %23 = inttoptr i64 %22 to [4294967295 x i8] addrspace(2)*
+  %24 = extractelement <2 x float> %5, i32 0
+  %25 = extractelement <2 x float> %5, i32 1
+  %26 = call float @llvm.amdgcn.interp.p1(float %24, i32 0, i32 0, i32 %3) #2
+  %27 = call float @llvm.amdgcn.interp.p2(float %26, float %25, i32 0, i32 0, i32 %3) #2
+  %28 = insertelement <4 x float> undef, float %27, i32 0
+  %29 = call float @llvm.amdgcn.interp.p1(float %24, i32 1, i32 0, i32 %3) #2
+  %30 = call float @llvm.amdgcn.interp.p2(float %29, float %25, i32 1, i32 0, i32 %3) #2
+  %31 = insertelement <4 x float> %28, float %30, i32 1
+  %32 = call float @llvm.amdgcn.interp.p1(float %24, i32 2, i32 0, i32 %3) #2
+  %33 = call float @llvm.amdgcn.interp.p2(float %32, float %25, i32 2, i32 0, i32 %3) #2
+  %34 = insertelement <4 x float> %31, float %33, i32 2
+  %35 = call float @llvm.amdgcn.interp.p1(float %24, i32 3, i32 0, i32 %3) #2
+  %36 = call float @llvm.amdgcn.interp.p2(float %35, float %25, i32 3, i32 0, i32 %3) #2
+  %37 = insertelement <4 x float> %34, float %36, i32 3
+  %38 = inttoptr i64 %22 to <4 x i32> addrspace(2)*
+  %39 = load <4 x i32>, <4 x i32> addrspace(2)* %38, align 16
+  %40 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %39, i32 0, i32 0, i1 false, i1 false) #3
+  %41 = bitcast float %40 to i32
+  %42 = icmp eq i32 %41, 0
+  %43 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(2)* %23, i64 0, i64 64
+  %44 = bitcast i8 addrspace(2)* %43 to <4 x i32> addrspace(2)*
+  br label %45
+
+; <label>:45:                                     ; preds = %55, %20
+  %46 = phi <4 x float> [ %37, %20 ], [ %56, %55 ]
+  %47 = phi i32 [ 0, %20 ], [ %57, %55 ]
+  br i1 %42, label %52, label %48
+
+; <label>:48:                                     ; preds = %45
+  %49 = load <4 x i32>, <4 x i32> addrspace(2)* %44, align 16
+  %50 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %49, i32 0, i32 0, i1 false, i1 false) #3
+  %51 = bitcast float %50 to i32
+  br label %52
+
+; <label>:52:                                     ; preds = %48, %45
+  %53 = phi i32 [ %51, %48 ], [ 0, %45 ]
+  %54 = icmp slt i32 %47, %53
+  br i1 %54, label %55, label %58
+
+; <label>:55:                                     ; preds = %52
+  %56 = shufflevector <4 x float> %46, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+  %57 = add i32 %47, 1
+  br label %45
+
+; <label>:58:                                     ; preds = %52
+  %59 = extractelement <4 x float> %46, i32 0
+  %60 = extractelement <4 x float> %46, i32 1
+  %61 = extractelement <4 x float> %46, i32 2
+  %62 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %59, float %60) #2
+  %63 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %61, float 1.000000e+00) #2
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %62, <2 x half> %63, i1 true, i1 true) #3
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
+
+; Function Attrs: nounwind readnone speculatable
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
+
+; Function Attrs: nounwind readnone speculatable
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
+
+; Function Attrs: nounwind readnone speculatable
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #2
+
+; Function Attrs: nounwind
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #3
+
+attributes #0 = { nounwind "InitialPSInputAddr"="2" "amdgpu-git-ptr-high"="1" }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { nounwind }
+
+!opencl.kernels = !{}
+!spirv.EntryPoints = !{!0}
+!opencl.enable.FP_CONTRACT = !{}
+!spirv.Source = !{!2}
+!opencl.spir.version = !{!3}
+!opencl.ocl.version = !{!3}
+!opencl.used.extensions = !{!4}
+!opencl.used.optional.core.features = !{!4}
+!spirv.Generator = !{!5}
+
+!0 = distinct !{null, !1}
+!1 = !{!"spirv.ExecutionMode.Fragment", i32 1, i32 0, i32 0}
+!2 = !{i32 3, i32 102000}
+!3 = !{i32 1, i32 2}
+!4 = !{}
+!5 = !{i16 8, i16 1}
+!6 = !{i32 4}