Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1087,7 +1087,7 @@
            (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
         MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
         if (ContainingLoop) {
-          MachineBasicBlock *TBB = ContainingLoop->getTopBlock();
+          MachineBasicBlock *TBB = ContainingLoop->getHeader();
           BlockWaitcntBrackets *ScoreBracket =
               BlockWaitcntBracketsMap[TBB].get();
           if (!ScoreBracket) {
@@ -1097,7 +1097,7 @@
           }
           ScoreBracket->setRevisitLoop(true);
           DEBUG(dbgs() << "set-revisit: block"
-                       << ContainingLoop->getTopBlock()->getNumber() << '\n';);
+                       << ContainingLoop->getHeader()->getNumber() << '\n';);
         }
       }
 
@@ -1758,12 +1758,12 @@
     // If we are walking into the block from before the loop, then guarantee
     // at least 1 re-walk over the loop to propagate the information, even if
     // no S_WAITCNT instructions were generated.
-    if (ContainingLoop && ContainingLoop->getTopBlock() == &MBB && J < I &&
+    if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
         (BlockWaitcntProcessedSet.find(&MBB) ==
          BlockWaitcntProcessedSet.end())) {
       BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
       DEBUG(dbgs() << "set-revisit: block"
-                   << ContainingLoop->getTopBlock()->getNumber() << '\n';);
+                   << ContainingLoop->getHeader()->getNumber() << '\n';);
     }
 
     // Walk over the instructions.
@@ -1774,7 +1774,7 @@
 
     // See if we want to revisit the loop.
     if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
-      MachineBasicBlock *EntryBB = ContainingLoop->getTopBlock();
+      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
       BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
       if (EntrySB && EntrySB->getRevisitLoop()) {
         EntrySB->setRevisitLoop(false);
Index: test/CodeGen/AMDGPU/waitcnt-looptest.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/waitcnt-looptest.ll
@@ -0,0 +1,110 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=DEFAULT %s
+
+; Check that the waitcnt insertion algorithm correctly propagates wait counts
+; from before a loop to the loop header.
+
+; DEFAULT-LABEL: {{^}}testKernel
+; DEFAULT: s_waitcnt vmcnt(4)
+; DEFAULT-NEXT: v_cmp_o_f64_e64
+; DEFAULT: s_waitcnt vmcnt(4)
+; DEFAULT-NEXT: v_cmp_o_f64_e64
+; DEFAULT: s_waitcnt vmcnt(4)
+; DEFAULT-NEXT: v_cmp_o_f64_e64
+; DEFAULT: s_waitcnt vmcnt(4)
+; DEFAULT-NEXT: v_cmp_o_f64_e64
+; DEFAULT: s_waitcnt vmcnt(3)
+; DEFAULT-NEXT: v_cmp_o_f64_e64
+; DEFAULT: s_waitcnt vmcnt(2)
+; DEFAULT-NEXT: v_cmp_o_f64_e64
+; DEFAULT: s_waitcnt vmcnt(1)
+; DEFAULT-NEXT: v_cmp_o_f64_e64
+; DEFAULT: s_waitcnt vmcnt(0)
+; DEFAULT-NEXT: v_cmp_o_f64_e64
+; DEFAULT: s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT: v_mov_b32_e32
+
+; ModuleID = 'testfile.opt.bc'
+source_filename = "llvm-link"
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target triple = "amdgcn-amd-amdhsa-opencl"
+
+@0 = internal addrspace(1) global <16 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, align 128
+
+; testKernel (nounwind): loads the <16 x double> global @0, and-reduces the sign-extended fcmp ord lanes to one i64, and stores its top bit as an i32 into the pointer argument.
+define amdgpu_kernel void @testKernel(i32 addrspace(1)* nocapture) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 {
+  %2 = tail call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #3
+  %3 = tail call i32 @llvm.amdgcn.workitem.id.x() #3
+  %4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #3
+  %5 = getelementptr inbounds i8, i8 addrspace(2)* %2, i64 4
+  %6 = bitcast i8 addrspace(2)* %5 to i16 addrspace(2)*
+  %7 = load i16, i16 addrspace(2)* %6, align 4, !tbaa !10
+  %8 = zext i16 %7 to i32
+  %9 = mul i32 %4, %8
+  %10 = add i32 %9, %3
+  %11 = tail call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #3
+  %12 = zext i32 %10 to i64
+  %13 = bitcast i8 addrspace(2)* %11 to i64 addrspace(2)*
+  %14 = load i64, i64 addrspace(2)* %13, align 8, !tbaa !17
+  %15 = add i64 %14, %12
+  %16 = load <16 x double>, <16 x double> addrspace(1)* @0, align 128, !tbaa !2
+  %17 = fcmp ord <16 x double> %16, zeroinitializer
+  %18 = sext <16 x i1> %17 to <16 x i64>
+  %19 = shufflevector <16 x i64> %18, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %20 = shufflevector <16 x i64> %18, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %21 = and <8 x i64> %19, %20
+  %22 = shufflevector <8 x i64> %21, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %23 = shufflevector <8 x i64> %21, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %24 = and <4 x i64> %22, %23
+  %25 = shufflevector <4 x i64> %24, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %26 = shufflevector <4 x i64> %24, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %27 = and <2 x i64> %25, %26
+  %28 = extractelement <2 x i64> %27, i64 0
+  %29 = extractelement <2 x i64> %27, i64 1
+  %30 = and i64 %28, %29
+  %.lobit = lshr i64 %30, 63
+  %31 = trunc i64 %.lobit to i32
+  %32 = and i64 %15, 4294967295
+  %33 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 %32
+  store i32 %31, i32 addrspace(1)* %33, align 4, !tbaa !18
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #3
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #3
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workgroup.id.x() #3
+
+; Function Attrs: nounwind readnone
+declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #3
+
+attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "target-features"="+16-bit-insts,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "target-features"="+16-bit-insts,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "target-features"="+16-bit-insts,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind readnone }
+
+!opencl.ocl.version = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 2, i32 0}
+!1 = !{!"clang version 4.0 "}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{i32 1}
+!6 = !{!"none"}
+!7 = !{!"uint*"}
+!8 = !{!""}
+!9 = !{!"results"}
+!10 = !{!11, !12, i64 4}
+!11 = !{!"hsa_kernel_dispatch_packet_s", !12, i64 0, !12, i64 2, !12, i64 4, !12, i64 6, !12, i64 8, !12, i64 10, !13, i64 12, !13, i64 16, !13, i64 20, !13, i64 24, !13, i64 28, !14, i64 32, !15, i64 40, !14, i64 48, !16, i64 56}
+!12 = !{!"short", !3, i64 0}
+!13 = !{!"int", !3, i64 0}
+!14 = !{!"long", !3, i64 0}
+!15 = !{!"any pointer", !3, i64 0}
+!16 = !{!"hsa_signal_s", !14, i64 0}
+!17 = !{!14, !14, i64 0}
+!18 = !{!13, !13, i64 0}