Index: lib/Target/AMDGPU/SIInsertWaits.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaits.cpp +++ lib/Target/AMDGPU/SIInsertWaits.cpp @@ -524,6 +524,16 @@ } } +/// Return true if \p MBB has one successor immediately following, and is its +/// only predecessor +static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) { + if (MBB.succ_size() != 1) + return false; + + const MachineBasicBlock *Succ = *MBB.succ_begin(); + return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ); +} + // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" // around other non-memory instructions. bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { @@ -642,8 +652,10 @@ EndPgmBlocks.push_back(&MBB); } - // Wait for everything at the end of the MBB - Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + // Wait for everything at the end of the MBB. If there is only one + // successor, we can defer this until the uses there. + if (!hasTrivialSuccessor(MBB)) + Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); } if (HaveScalarStores) { Index: test/CodeGen/AMDGPU/waitcnt.mir =================================================================== --- test/CodeGen/AMDGPU/waitcnt.mir +++ test/CodeGen/AMDGPU/waitcnt.mir @@ -7,6 +7,15 @@ <4 x i32> addrspace(4)* %flat16) { ret void } + + define void @single_fallthrough_successor_no_end_block_wait() { + ret void + } + + define void @single_branch_successor_not_next_block() { + ret void + } + ... --- @@ -21,18 +30,21 @@ # CHECK-LABEL: bb.1: # CHECK: FLAT_LOAD_DWORD +# CHECK: S_WAITCNT 3952 # CHECK: FLAT_LOAD_DWORDX4 # The first load has no mem operand, so we should assume it accesses the flat # address space. # s_waitcnt vmcnt(0) lgkmcnt(0) -# CHECK-NEXT: S_WAITCNT 112 +# CHECK-NEXT: S_WAITCNT 127 # CHECK-LABEL: bb.2: # CHECK: FLAT_LOAD_DWORD +# CHECK: S_WAITCNT 3952 # CHECK: FLAT_LOAD_DWORDX4 + # One outstand loads access the flat address space. # s_waitcnt vmcnt(0) lgkmcnt(0) -# CHECK-NEXT: S_WAITCNT 112 +# CHECK-NEXT: S_WAITCNT 127 name: flat_zero_waitcnt @@ -57,3 +69,60 @@ %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec S_ENDPGM ... +--- +# There is only a single fallthrough successor block, so there's no +# need to wait immediately. + +# CHECK-LABEL: name: single_fallthrough_successor_no_end_block_wait +# CHECK: %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2 +# CHECK-NOT: S_WAITCNT + +# CHECK: bb.1: +# CHECK-NEXT: V_LSHLREV_B64 +# CHECK-NEXT: S_WAITCNT 112 +# CHECK-NEXT: FLAT_STORE_DWORD +name: single_fallthrough_successor_no_end_block_wait + +body: | + bb.0: + successors: %bb.1 + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr + + bb.1: + %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec + FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# The block has a single predecessor with a single successor, but it +# is not the next block so it's non-obvious that the wait is not needed. + + +# CHECK-LABEL: name: single_branch_successor_not_next_block +# CHECK: %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2 +# CHECK-NEXT: S_WAITCNT 112 + +# CHECK: bb.1 +# CHECK-NEXT: FLAT_STORE_DWORD +# CHECK-NEXT: S_ENDPGM + +# CHECK: bb.2: +# CHECK-NEXT: V_LSHLREV_B64 +# CHECK-NEXT: FLAT_STORE_DWORD +name: single_branch_successor_not_next_block + +body: | + bb.0: + successors: %bb.2 + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr + S_BRANCH %bb.2 + + bb.1: + FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM + + bb.2: + %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec + FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +...