Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1558,7 +1558,8 @@ MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) { MachineBasicBlock *Bottom = Loop->getHeader(); for (MachineBasicBlock *MBB : Loop->blocks()) - if (MBB->getNumber() > Bottom->getNumber()) + if (MBB->getNumber() > Bottom->getNumber() && + MBB->isSuccessor(Loop->getHeader())) Bottom = MBB; return Bottom; } Index: test/CodeGen/AMDGPU/waitcnt-looptest.ll =================================================================== --- test/CodeGen/AMDGPU/waitcnt-looptest.ll +++ test/CodeGen/AMDGPU/waitcnt-looptest.ll @@ -144,3 +144,122 @@ attributes #0 = { "target-cpu"="fiji" "target-features"="-flat-for-global" } attributes #1 = { nounwind readnone speculatable } + +; Check that the waitcnt insertion algorithm correctly propagates wait counts +; from the loop bottom to the loop header. 
+ +; GCN-LABEL: {{^}}testLoopBottom +; GCN: BB1_2: +; GCN: s_waitcnt vmcnt(0) + +define amdgpu_kernel void @testLoopBottom([0 x i8] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), [0 x i8] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32, i32, i32, i32) { +main_body: + %8 = add i32 %4, %2 + %9 = and i32 %8, 3 + %10 = lshr i32 %8, 2 + %11 = getelementptr [0 x i8], [0 x i8] addrspace(4)* %1, i64 0, i64 48 + %12 = bitcast i8 addrspace(4)* %11 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0 + %13 = shl i32 %10, 5 + %14 = load <4 x i32>, <4 x i32> addrspace(4)* %12, align 16, !invariant.load !0 + %15 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %14, i32 0, i32 %13, i1 false, i1 false) + %16 = bitcast [0 x i8] addrspace(4)* %0 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0 + %17 = load <4 x i32>, <4 x i32> addrspace(4)* %16, align 16, !invariant.load !0 + %18 = call float @llvm.SI.load.const.v4i32(<4 x i32> %17, i32 64) + %19 = call float @llvm.SI.load.const.v4i32(<4 x i32> %17, i32 80) + %20 = icmp ult i32 %9, 2 + %21 = icmp eq i32 %9, 0 + %22 = select i1 %21, i32 1065353216, i32 -1082130432 + %23 = icmp eq i32 %9, 3 + %24 = select i1 %23, i32 1065353216, i32 -1082130432 + %25 = select i1 %20, i32 %22, i32 %24 + %26 = select i1 %20, i32 1065353216, i32 -1082130432 + %27 = bitcast i32 %25 to float + %28 = fmul float %27, 5.000000e-01 + %29 = bitcast i32 %26 to float + %30 = fmul float %29, -5.000000e-01 + %31 = fadd float %28, 5.000000e-01 + %32 = fadd float %30, 5.000000e-01 + %33 = getelementptr [0 x i8], [0 x i8] addrspace(4)* %1, i64 0, i64 16 + %34 = bitcast i8 addrspace(4)* %33 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0 + %35 = load <4 x i32>, <4 x i32> addrspace(4)* %34, align 16, !invariant.load !0 + %36 = call float @llvm.SI.load.const.v4i32(<4 x i32> %35, i32 0) + br label %37 + +;