Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1558,7 +1558,8 @@
 MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
   MachineBasicBlock *Bottom = Loop->getHeader();
   for (MachineBasicBlock *MBB : Loop->blocks())
-    if (MBB->getNumber() > Bottom->getNumber())
+    if (MBB->getNumber() > Bottom->getNumber() &&
+        MBB->isSuccessor(Loop->getHeader()))
       Bottom = MBB;
   return Bottom;
 }
Index: test/CodeGen/AMDGPU/waitcnt-looptest.ll
===================================================================
--- test/CodeGen/AMDGPU/waitcnt-looptest.ll
+++ test/CodeGen/AMDGPU/waitcnt-looptest.ll
@@ -144,3 +144,122 @@
 
 attributes #0 = { "target-cpu"="fiji" "target-features"="-flat-for-global" }
 attributes #1 = { nounwind readnone speculatable }
+
+; Check that the waitcnt insertion algorithm correctly propagates wait counts
+; from the loop bottom to the loop header.
+
+; GCN-LABEL: {{^}}testLoopBottom
+; GCN: BB1_2:
+; GCN: s_waitcnt vmcnt(0)
+
+define amdgpu_kernel void @testLoopBottom([0 x i8] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), [0 x i8] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32, i32, i32, i32) {
+main_body:
+  %8 = add i32 %4, %2
+  %9 = and i32 %8, 3
+  %10 = lshr i32 %8, 2
+  %11 = getelementptr [0 x i8], [0 x i8] addrspace(4)* %1, i64 0, i64 48
+  %12 = bitcast i8 addrspace(4)* %11 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0
+  %13 = shl i32 %10, 5
+  %14 = load <4 x i32>, <4 x i32> addrspace(4)* %12, align 16, !invariant.load !0
+  %15 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %14, i32 0, i32 %13, i1 false, i1 false)
+  %16 = bitcast [0 x i8] addrspace(4)* %0 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0
+  %17 = load <4 x i32>, <4 x i32> addrspace(4)* %16, align 16, !invariant.load !0
+  %18 = call float @llvm.SI.load.const.v4i32(<4 x i32> %17, i32 64)
+  %19 = call float @llvm.SI.load.const.v4i32(<4 x i32> %17, i32 80)
+  %20 = icmp ult i32 %9, 2
+  %21 = icmp eq i32 %9, 0
+  %22 = select i1 %21, i32 1065353216, i32 -1082130432
+  %23 = icmp eq i32 %9, 3
+  %24 = select i1 %23, i32 1065353216, i32 -1082130432
+  %25 = select i1 %20, i32 %22, i32 %24
+  %26 = select i1 %20, i32 1065353216, i32 -1082130432
+  %27 = bitcast i32 %25 to float
+  %28 = fmul float %27, 5.000000e-01
+  %29 = bitcast i32 %26 to float
+  %30 = fmul float %29, -5.000000e-01
+  %31 = fadd float %28, 5.000000e-01
+  %32 = fadd float %30, 5.000000e-01
+  %33 = getelementptr [0 x i8], [0 x i8] addrspace(4)* %1, i64 0, i64 16
+  %34 = bitcast i8 addrspace(4)* %33 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0
+  %35 = load <4 x i32>, <4 x i32> addrspace(4)* %34, align 16, !invariant.load !0
+  %36 = call float @llvm.SI.load.const.v4i32(<4 x i32> %35, i32 0)
+  br label %37
+
+; <label>:37:                                     ; preds = %57, %main_body
+  %.in = phi float [ %36, %main_body ], [ %66, %57 ]
+  %38 = phi i32 [ 0, %main_body ], [ %63, %57 ]
+  %39 = phi float [ %15, %main_body ], [ %58, %57 ]
+  %40 = phi float [ %15, %main_body ], [ %59, %57 ]
+  %41 = fptosi float %.in to i32
+  %42 = icmp eq i32 %41, 0
+  br i1 %42, label %43, label %50
+
+; <label>:43:                                     ; preds = %37, %50
+  %44 = bitcast i32 %25 to float
+  %45 = fmul float %18, %44
+  %46 = bitcast i32 %26 to float
+  %47 = fmul float %19, %46
+  %48 = fadd float %45, %47
+  %49 = fadd float %39, %48
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %49, float undef, float undef, float undef, i1 true, i1 false) #3
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %31, float %32, float undef, float 0.000000e+00, i1 false, i1 false) #3
+  ret void
+
+; <label>:50:                                     ; preds = %37
+  %51 = icmp sgt i32 %38, 895
+  br i1 %51, label %43, label %52
+
+; <label>:52:                                     ; preds = %50
+  %53 = icmp eq i32 %41, 9
+  %54 = icmp eq i32 %41, 8
+  %55 = or i32 %41, 1
+  %56 = icmp ne i32 %55, 9
+  br i1 %53, label %67, label %57
+
+; <label>:57:                                     ; preds = %52, %67
+  %58 = phi float [ %77, %67 ], [ %39, %52 ]
+  %59 = phi float [ %77, %67 ], [ %40, %52 ]
+  %60 = zext i1 %54 to i32
+  %61 = add i32 %38, %60
+  %62 = zext i1 %56 to i32
+  %63 = add i32 %61, %62
+  %64 = shl i32 %63, 4
+  %65 = load <4 x i32>, <4 x i32> addrspace(4)* %34, align 16, !invariant.load !0
+  %66 = call float @llvm.SI.load.const.v4i32(<4 x i32> %65, i32 %64)
+  br label %37
+
+; <label>:67:                                     ; preds = %52
+  %68 = shl i32 %38, 4
+  %69 = load <4 x i32>, <4 x i32> addrspace(4)* %34, align 16, !invariant.load !0
+  %70 = or i32 %68, 4
+  %71 = call float @llvm.SI.load.const.v4i32(<4 x i32> %69, i32 %70)
+  %72 = add nsw i32 %70, 4
+  %73 = call float @llvm.SI.load.const.v4i32(<4 x i32> %69, i32 %72)
+  %74 = fmul float %18, %71
+  %75 = fmul float %19, %73
+  %76 = fadd float %74, %75
+  %77 = fadd float %40, %76
+  br label %57
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #2
+
+; Function Attrs: nounwind
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #3
+
+; Function Attrs: nounwind readonly
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!0 = !{}