Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1558,7 +1558,8 @@
 MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
   MachineBasicBlock *Bottom = Loop->getHeader();
   for (MachineBasicBlock *MBB : Loop->blocks())
-    if (MBB->getNumber() > Bottom->getNumber())
+    if (MBB->getNumber() > Bottom->getNumber() &&
+        MBB->isSuccessor(Loop->getHeader()))
       Bottom = MBB;
   return Bottom;
 }
Index: test/CodeGen/AMDGPU/waitcnt-looptest.ll
===================================================================
--- test/CodeGen/AMDGPU/waitcnt-looptest.ll
+++ test/CodeGen/AMDGPU/waitcnt-looptest.ll
@@ -144,3 +144,74 @@
 
 attributes #0 = { "target-cpu"="fiji" "target-features"="-flat-for-global" }
 attributes #1 = { nounwind readnone speculatable }
+
+; Check that the waitcnt insertion algorithm correctly propagates wait counts
+; from the loop bottom to the loop header.
+
+; GCN-LABEL: {{^}}testLoopBottom
+; GCN: BB1_2:
+; GCN: s_waitcnt vmcnt(0)
+
+define amdgpu_kernel void @testLoopBottom([0 x i8] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), [0 x i8] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32, i32, i32, i32) {
+main_body:
+  %8 = getelementptr [0 x i8], [0 x i8] addrspace(4)* %1, i64 0, i64 48
+  %9 = bitcast i8 addrspace(4)* %8 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0
+  %10 = load <4 x i32>, <4 x i32> addrspace(4)* %9, align 16, !invariant.load !0
+  %11 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %10, i32 0, i32 0, i1 false, i1 false)
+  %12 = bitcast [0 x i8] addrspace(4)* %0 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0
+  %13 = load <4 x i32>, <4 x i32> addrspace(4)* %12, align 16, !invariant.load !0
+  %14 = call float @llvm.SI.load.const.v4i32(<4 x i32> %13, i32 64)
+  %15 = icmp eq i32 %2, 3
+  %16 = select i1 %15, i32 1065353216, i32 -1082130432
+  br label %17
+
+; <label>:17:                                     ; preds = %31, %main_body
+  %.in = phi float [ %14, %main_body ], [ %36, %31 ]
+  %18 = phi i32 [ 0, %main_body ], [ %34, %31 ]
+  %19 = phi float [ %11, %main_body ], [ %32, %31 ]
+  %20 = phi float [ %11, %main_body ], [ %33, %31 ]
+  %21 = fptosi float %.in to i32
+  %22 = icmp eq i32 %21, 0
+  br i1 %22, label %23, label %27
+
+; <label>:23:                                     ; preds = %17
+  %24 = bitcast i32 %16 to float
+  %25 = fmul float %14, %24
+  %26 = fadd float %19, %25
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %26, float undef, float undef, float undef, i1 true, i1 false) #3
+  ret void
+
+; <label>:27:                                     ; preds = %17
+  %28 = icmp eq i32 %21, 9
+  %29 = icmp eq i32 %21, 8
+  %30 = icmp ne i32 %21, 9
+  br i1 %28, label %37, label %31
+
+; <label>:31:                                     ; preds = %27, %37
+  %32 = phi float [ %39, %37 ], [ %19, %27 ]
+  %33 = phi float [ %39, %37 ], [ %20, %27 ]
+  %34 = zext i1 %29 to i32
+  %35 = load <4 x i32>, <4 x i32> addrspace(4)* %9, align 16, !invariant.load !0
+  %36 = call float @llvm.SI.load.const.v4i32(<4 x i32> %35, i32 %34)
+  br label %17
+
+; <label>:37:                                     ; preds = %27
+  %38 = load <4 x i32>, <4 x i32> addrspace(4)* %9, align 16, !invariant.load !0
+  %39 = call float @llvm.SI.load.const.v4i32(<4 x i32> %38, i32 0)
+  br label %31
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
+
+; Function Attrs: nounwind
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #2
+
+; Function Attrs: nounwind readonly
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
+
+attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+
+!0 = !{}