Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1087,7 +1087,7 @@
            (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
         MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
         if (ContainingLoop) {
-          MachineBasicBlock *TBB = ContainingLoop->getTopBlock();
+          MachineBasicBlock *TBB = ContainingLoop->getHeader();
           BlockWaitcntBrackets *ScoreBracket =
               BlockWaitcntBracketsMap[TBB].get();
           if (!ScoreBracket) {
@@ -1097,7 +1097,7 @@
           }
           ScoreBracket->setRevisitLoop(true);
           DEBUG(dbgs() << "set-revisit: block"
-                       << ContainingLoop->getTopBlock()->getNumber() << '\n';);
+                       << ContainingLoop->getHeader()->getNumber() << '\n';);
         }
       }
 
@@ -1758,12 +1758,12 @@
     // If we are walking into the block from before the loop, then guarantee
     // at least 1 re-walk over the loop to propagate the information, even if
     // no S_WAITCNT instructions were generated.
-    if (ContainingLoop && ContainingLoop->getTopBlock() == &MBB && J < I &&
+    if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
         (BlockWaitcntProcessedSet.find(&MBB) ==
          BlockWaitcntProcessedSet.end())) {
       BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
       DEBUG(dbgs() << "set-revisit: block"
-                   << ContainingLoop->getTopBlock()->getNumber() << '\n';);
+                   << ContainingLoop->getHeader()->getNumber() << '\n';);
     }
 
     // Walk over the instructions.
@@ -1774,7 +1774,7 @@
 
     // See if we want to revisit the loop.
     if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
-      MachineBasicBlock *EntryBB = ContainingLoop->getTopBlock();
+      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
       BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
       if (EntrySB && EntrySB->getRevisitLoop()) {
         EntrySB->setRevisitLoop(false);
Index: test/CodeGen/AMDGPU/waitcnt-looptest.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/waitcnt-looptest.ll
@@ -0,0 +1,171 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=GCN %s
+
+; Check that the waitcnt insertion algorithm correctly propagates wait counts
+; from before a loop to the loop header.
+
+; GCN-LABEL: {{^}}testKernel
+; GCN: BB0_1:
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_f32_e64
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_f32_e32
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_f32_e32
+
+@data_generic = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4
+@data_reference = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4
+
+define amdgpu_kernel void @testKernel(i32 addrspace(1)* nocapture %arg) local_unnamed_addr #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 !kernel_arg_name !6 {
+bb:
+  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float> addrspace(4)* bitcast (float addrspace(4)* getelementptr ([100 x float], [100 x float] addrspace(4)* addrspacecast ([100 x float] addrspace(1)* @data_generic to [100 x float] addrspace(4)*), i64 0, i64 4) to <2 x float> addrspace(4)*), align 4, !tbaa !7  ; overwrite @data_generic[4..5] through an addrspace(4) pointer before the loop
+  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float> addrspace(4)* bitcast (float addrspace(4)* getelementptr ([100 x float], [100 x float] addrspace(4)* addrspacecast ([100 x float] addrspace(1)* @data_reference to [100 x float] addrspace(4)*), i64 0, i64 4) to <2 x float> addrspace(4)*), align 4, !tbaa !7  ; same overwrite for @data_reference[4..5]
+  br label %bb18  ; fall into the loop with the two stores above still preceding it
+
+bb1:                                              ; preds = %bb18
+  %tmp = tail call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp3 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %tmp4 = getelementptr inbounds i8, i8 addrspace(2)* %tmp, i64 4
+  %tmp5 = bitcast i8 addrspace(2)* %tmp4 to i16 addrspace(2)*
+  %tmp6 = load i16, i16 addrspace(2)* %tmp5, align 4, !tbaa !10
+  %tmp7 = zext i16 %tmp6 to i32
+  %tmp8 = mul i32 %tmp3, %tmp7
+  %tmp9 = add i32 %tmp8, %tmp2
+  %tmp10 = tail call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+  %tmp11 = zext i32 %tmp9 to i64
+  %tmp12 = bitcast i8 addrspace(2)* %tmp10 to i64 addrspace(2)*
+  %tmp13 = load i64, i64 addrspace(2)* %tmp12, align 8, !tbaa !17
+  %tmp14 = add i64 %tmp13, %tmp11
+  %tmp15 = zext i1 %tmp99 to i32  ; %tmp99 is the AND of every compare performed in bb18
+  %tmp16 = and i64 %tmp14, 4294967295
+  %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
+  store i32 %tmp15, i32 addrspace(1)* %tmp17, align 4, !tbaa !18  ; write the 0/1 result into %arg at the computed index
+  ret void
+
+bb18:                                             ; preds = %bb18, %bb
+  %tmp19 = phi i64 [ 0, %bb ], [ %tmp102, %bb18 ]  ; i64 index of the first element pair compared this iteration
+  %tmp20 = phi i32 [ 0, %bb ], [ %tmp100, %bb18 ]  ; i32 counter, advanced by 10 per iteration
+  %tmp21 = phi i1 [ true, %bb ], [ %tmp99, %bb18 ]  ; running AND of all compares from earlier iterations
+  %tmp22 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp19
+  %tmp23 = load float, float addrspace(1)* %tmp22, align 4, !tbaa !19
+  %tmp24 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp19
+  %tmp25 = load float, float addrspace(1)* %tmp24, align 4, !tbaa !19
+  %tmp26 = fcmp oeq float %tmp23, %tmp25
+  %tmp27 = and i1 %tmp21, %tmp26
+  %tmp28 = or i32 %tmp20, 1
+  %tmp29 = sext i32 %tmp28 to i64
+  %tmp30 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp29
+  %tmp31 = load float, float addrspace(1)* %tmp30, align 4, !tbaa !19
+  %tmp32 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp29
+  %tmp33 = load float, float addrspace(1)* %tmp32, align 4, !tbaa !19
+  %tmp34 = fcmp oeq float %tmp31, %tmp33
+  %tmp35 = and i1 %tmp27, %tmp34
+  %tmp36 = add nuw nsw i32 %tmp20, 2
+  %tmp37 = sext i32 %tmp36 to i64
+  %tmp38 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp37
+  %tmp39 = load float, float addrspace(1)* %tmp38, align 4, !tbaa !19
+  %tmp40 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp37
+  %tmp41 = load float, float addrspace(1)* %tmp40, align 4, !tbaa !19
+  %tmp42 = fcmp oeq float %tmp39, %tmp41
+  %tmp43 = and i1 %tmp35, %tmp42
+  %tmp44 = add nuw nsw i32 %tmp20, 3
+  %tmp45 = sext i32 %tmp44 to i64
+  %tmp46 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp45
+  %tmp47 = load float, float addrspace(1)* %tmp46, align 4, !tbaa !19
+  %tmp48 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp45
+  %tmp49 = load float, float addrspace(1)* %tmp48, align 4, !tbaa !19
+  %tmp50 = fcmp oeq float %tmp47, %tmp49
+  %tmp51 = and i1 %tmp43, %tmp50
+  %tmp52 = add nuw nsw i32 %tmp20, 4
+  %tmp53 = sext i32 %tmp52 to i64
+  %tmp54 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp53
+  %tmp55 = load float, float addrspace(1)* %tmp54, align 4, !tbaa !19
+  %tmp56 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp53
+  %tmp57 = load float, float addrspace(1)* %tmp56, align 4, !tbaa !19
+  %tmp58 = fcmp oeq float %tmp55, %tmp57
+  %tmp59 = and i1 %tmp51, %tmp58
+  %tmp60 = add nuw nsw i32 %tmp20, 5
+  %tmp61 = sext i32 %tmp60 to i64
+  %tmp62 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp61
+  %tmp63 = load float, float addrspace(1)* %tmp62, align 4, !tbaa !19
+  %tmp64 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp61
+  %tmp65 = load float, float addrspace(1)* %tmp64, align 4, !tbaa !19
+  %tmp66 = fcmp oeq float %tmp63, %tmp65
+  %tmp67 = and i1 %tmp59, %tmp66
+  %tmp68 = add nuw nsw i32 %tmp20, 6
+  %tmp69 = sext i32 %tmp68 to i64
+  %tmp70 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp69
+  %tmp71 = load float, float addrspace(1)* %tmp70, align 4, !tbaa !19
+  %tmp72 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp69
+  %tmp73 = load float, float addrspace(1)* %tmp72, align 4, !tbaa !19
+  %tmp74 = fcmp oeq float %tmp71, %tmp73
+  %tmp75 = and i1 %tmp67, %tmp74
+  %tmp76 = add nuw nsw i32 %tmp20, 7
+  %tmp77 = sext i32 %tmp76 to i64
+  %tmp78 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp77
+  %tmp79 = load float, float addrspace(1)* %tmp78, align 4, !tbaa !19
+  %tmp80 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp77
+  %tmp81 = load float, float addrspace(1)* %tmp80, align 4, !tbaa !19
+  %tmp82 = fcmp oeq float %tmp79, %tmp81
+  %tmp83 = and i1 %tmp75, %tmp82
+  %tmp84 = add nuw nsw i32 %tmp20, 8
+  %tmp85 = sext i32 %tmp84 to i64
+  %tmp86 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp85
+  %tmp87 = load float, float addrspace(1)* %tmp86, align 4, !tbaa !19
+  %tmp88 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp85
+  %tmp89 = load float, float addrspace(1)* %tmp88, align 4, !tbaa !19
+  %tmp90 = fcmp oeq float %tmp87, %tmp89
+  %tmp91 = and i1 %tmp83, %tmp90
+  %tmp92 = add nuw nsw i32 %tmp20, 9
+  %tmp93 = sext i32 %tmp92 to i64
+  %tmp94 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp93
+  %tmp95 = load float, float addrspace(1)* %tmp94, align 4, !tbaa !19
+  %tmp96 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp93
+  %tmp97 = load float, float addrspace(1)* %tmp96, align 4, !tbaa !19
+  %tmp98 = fcmp oeq float %tmp95, %tmp97
+  %tmp99 = and i1 %tmp91, %tmp98  ; AND of all ten compares in this iteration plus the carried-in %tmp21
+  %tmp100 = add nuw nsw i32 %tmp20, 10
+  %tmp101 = icmp eq i32 %tmp100, 100  ; leave the loop once the counter reaches 100
+  %tmp102 = sext i32 %tmp100 to i64
+  br i1 %tmp101, label %bb1, label %bb18  ; bb18 branches back to itself: a single-block loop in the IR
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
+
+; Function Attrs: nounwind readnone speculatable
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; Function Attrs: nounwind readnone speculatable
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+
+; Function Attrs: nounwind readnone speculatable
+declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #1
+
+attributes #0 = { "target-cpu"="fiji" "target-features"="-flat-for-global" }
+attributes #1 = { nounwind readnone speculatable }
+
+!opencl.ocl.version = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 2, i32 0}
+!1 = !{!"clang version 4.0 "}
+!2 = !{i32 1}
+!3 = !{!"none"}
+!4 = !{!"uint*"}
+!5 = !{!""}
+!6 = !{!"results"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+!10 = !{!11, !12, i64 4}
+!11 = !{!"hsa_kernel_dispatch_packet_s", !12, i64 0, !12, i64 2, !12, i64 4, !12, i64 6, !12, i64 8, !12, i64 10, !13, i64 12, !13, i64 16, !13, i64 20, !13, i64 24, !13, i64 28, !14, i64 32, !15, i64 40, !14, i64 48, !16, i64 56}
+!12 = !{!"short", !8, i64 0}
+!13 = !{!"int", !8, i64 0}
+!14 = !{!"long", !8, i64 0}
+!15 = !{!"any pointer", !8, i64 0}
+!16 = !{!"hsa_signal_s", !14, i64 0}
+!17 = !{!14, !14, i64 0}
+!18 = !{!13, !13, i64 0}
+!19 = !{!20, !20, i64 0}
+!20 = !{!"float", !8, i64 0}